Handling PDF in Java With Apache PDFBox

Apache PDFBox is an open-source Java library for working with PDF documents. This post shows how to use Apache PDFBox to handle PDF files. The code examples are based on pdfbox v2.0.29.

<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.29</version>
</dependency>

Extract Text

Extract text from all pages

String inputFilePath = "your/pdf/filepath";
// Load the PDF document. try-with-resources closes it even when text
// extraction throws, so the underlying file handle is never leaked.
try (PDDocument document = PDDocument.load(new File(inputFilePath))) {
    // Create the text stripper; no page range is set here
    PDFTextStripper pdfStripper = new PDFTextStripper();
    // Extract the text from the PDF
    String text = pdfStripper.getText(document);
    // Print the extracted text
    System.out.println(text);
}

Extract text page by page

String inputFilePath = "your/pdf/filepath";
// Load the PDF document. try-with-resources closes it even when extraction
// of some page throws.
try (PDDocument document = PDDocument.load(new File(inputFilePath))) {
    PDFTextStripper stripper = new PDFTextStripper();
    // Read the page count once instead of on every loop iteration
    int pageCount = document.getNumberOfPages();
    // Restrict the stripper to a single page per pass (loop runs 1..pageCount)
    for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
        stripper.setStartPage(pageNumber);
        stripper.setEndPage(pageNumber);

        String text = stripper.getText(document);
        System.out.println("Page " + pageNumber + ":");
        System.out.println(text);
    }
}

Split and Merge

Split

/**
 * Splits a PDF using a default {@code Splitter} and saves every resulting
 * document as {@code <outputDir>/<name>_split_<n>.pdf}, where {@code n}
 * starts at 1.
 *
 * @param inputFilePath path of the PDF to split
 * @param outputDir     directory that receives the split files
 * @throws IOException if the source cannot be loaded or a part cannot be saved
 */
private static void splitPdf(String inputFilePath, String outputDir) throws IOException {
    File file = new File(inputFilePath);
    // Strip only a trailing ".pdf" (any case). The previous unanchored pattern
    // also removed ".pdf" from the middle of a name and missed mixed case.
    String baseName = file.getName().replaceFirst("(?i)[.]pdf$", "");

    // try-with-resources closes the source even when splitting or saving fails
    try (PDDocument document = PDDocument.load(file)) {
        List<PDDocument> splitDocuments = new Splitter().split(document);
        int saved = 0;
        try {
            for (PDDocument splitDocument : splitDocuments) {
                String outputFilePath = outputDir + File.separator
                        + baseName + "_split_" + (saved + 1) + ".pdf";
                splitDocument.save(outputFilePath);
                splitDocument.close();
                saved++;
            }
        } finally {
            // Only reached with leftovers when a save/close above failed:
            // release the remaining parts without masking the original error.
            for (int i = saved; i < splitDocuments.size(); i++) {
                try {
                    splitDocuments.get(i).close();
                } catch (IOException ignored) {
                    // best-effort cleanup; the primary exception is already propagating
                }
            }
        }
    }
    System.out.println("PDF split successfully!");
}

Merge PDF files

/**
 * Merges the given PDF files, in list order, into a single output file.
 *
 * @param inputFilePaths paths of the PDFs to merge
 * @param outputFilePath path of the merged PDF to write
 * @throws IOException if a source file is missing or the merge fails
 */
private static void mergePdfFiles(List<String> inputFilePaths, String outputFilePath) throws IOException {
    PDFMergerUtility merger = new PDFMergerUtility();
    // Add as many files as you need
    for (String inputFilePath : inputFilePaths) {
        merger.addSource(new File(inputFilePath));
    }
    merger.setDestinationFileName(outputFilePath);
    // The no-arg mergeDocuments() is deprecated in PDFBox 2.0.x; pass the
    // memory setting explicitly (main memory only, as the no-arg form did).
    merger.mergeDocuments(org.apache.pdfbox.io.MemoryUsageSetting.setupMainMemoryOnly());
    System.out.println("PDF files merged successfully!");
}

Insert and remove pages

Insert pages

/**
 * Inserts a new blank page into a PDF and saves the result to another file.
 *
 * @param sourceFile path of the PDF to read
 * @param targetFile path of the modified PDF to write
 * @param pageIndex  0-based position of the new page; a value equal to the
 *                   current page count appends the page at the end
 * @throws IOException              if the PDF cannot be loaded or saved
 * @throws IllegalArgumentException if {@code pageIndex} is negative or
 *                                  greater than the page count
 */
public static void insertPage(String sourceFile, String targetFile, int pageIndex) throws IOException {
    // try-with-resources closes the document on every path, including the
    // validation failure below (which previously leaked the open document)
    try (PDDocument sourceDoc = PDDocument.load(new File(sourceFile))) {
        int sourcePageCount = sourceDoc.getNumberOfPages();
        // Validate the requested page index
        if (pageIndex < 0 || pageIndex > sourcePageCount) {
            throw new IllegalArgumentException("Invalid page index");
        }
        // Create a new blank page
        PDPage newPage = new PDPage();
        if (pageIndex == sourcePageCount) {
            // No existing page at this index to insert before: append instead
            sourceDoc.getPages().add(newPage);
        } else {
            sourceDoc.getPages().insertBefore(newPage, sourceDoc.getPages().get(pageIndex));
        }
        // Save the modified PDF document to the target file
        sourceDoc.save(targetFile);
    }
}

Remove pages

/**
 * Removes one page from a PDF and saves the result to another file.
 *
 * @param inputFilePath  path of the PDF to read
 * @param outputFilePath path of the modified PDF to write
 * @param pageIndex      0-based index of the page to remove
 * @throws IOException              if the PDF cannot be loaded or saved
 * @throws IllegalArgumentException if {@code pageIndex} is outside
 *                                  {@code [0, pageCount)}
 */
private static void removePage(String inputFilePath, String outputFilePath, int pageIndex) throws IOException {
    // try-with-resources closes the document on every path, including the
    // validation failure below (which previously leaked the open document)
    try (PDDocument sourceDoc = PDDocument.load(new File(inputFilePath))) {
        int sourcePageCount = sourceDoc.getNumberOfPages();
        // Validate the requested page index
        if (pageIndex < 0 || pageIndex >= sourcePageCount) {
            throw new IllegalArgumentException("Invalid page index");
        }
        sourceDoc.getPages().remove(pageIndex);
        sourceDoc.save(outputFilePath);
    }
}
/**
 * Alternative page removal: splits the PDF into single-page documents, drops
 * the one at {@code pageIndex} and reassembles the rest into a new document.
 *
 * @param inputFilePath  path of the PDF to read
 * @param outputFilePath path of the reassembled PDF to write
 * @param pageIndex      0-based index of the page to remove
 * @throws IOException              if the PDF cannot be loaded or saved
 * @throws IllegalArgumentException if {@code pageIndex} is outside
 *                                  {@code [0, pageCount)}
 */
private static void removePage2(String inputFilePath, String outputFilePath, int pageIndex) throws IOException {
    // try-with-resources closes the source on every path, including the
    // validation failure below
    try (PDDocument sourceDoc = PDDocument.load(new File(inputFilePath))) {
        int sourcePageCount = sourceDoc.getNumberOfPages();
        // Validate the requested page index
        if (pageIndex < 0 || pageIndex >= sourcePageCount) {
            throw new IllegalArgumentException("Invalid page index");
        }
        List<PDDocument> pages = new Splitter().split(sourceDoc);
        try (PDDocument outputDocument = new PDDocument()) {
            for (int i = 0; i < pages.size(); i++) {
                // Skip the page being removed; copy every other page across
                if (i != pageIndex) {
                    outputDocument.addPage(pages.get(i).getPage(0));
                }
            }
            outputDocument.save(outputFilePath);
        } finally {
            // The split documents were never closed before. Release them only
            // after the output has been saved, since it borrows their pages.
            for (PDDocument page : pages) {
                try {
                    page.close();
                } catch (IOException ignored) {
                    // best-effort cleanup of temporary single-page documents
                }
            }
        }
    }
}

Encryption

Encrypt

/**
 * Password-protects a PDF with printing and content extraction disabled and
 * saves the encrypted copy to another file.
 *
 * @param inputFilePath  path of the PDF to read
 * @param outputFilePath path of the encrypted PDF to write
 * @param password       used as both the owner and the user password
 * @throws IOException if the PDF cannot be loaded, protected or saved
 */
public static void encryptPdf(String inputFilePath, String outputFilePath, String password) throws IOException {
    // try-with-resources closes the document even when protect/save throws
    try (PDDocument doc = PDDocument.load(new File(inputFilePath))) {
        AccessPermission ap = new AccessPermission();
        // Disable printing
        ap.setCanPrint(false);
        // Disable copying
        ap.setCanExtractContent(false);
        // Disable other things if needed...

        // Constructor arguments: owner password (opens the file with all
        // permissions), then user password (opens it with restricted permissions).
        // NOTE(review): both are the same value here, so anyone who can open
        // the file presumably also holds owner rights - confirm this is intended.
        StandardProtectionPolicy spp = new StandardProtectionPolicy(password, password, ap);
        // Length of the encryption key. Possible values are 40, 128 or 256.
        int keyLength = 256;
        spp.setEncryptionKeyLength(keyLength);

        // Apply protection
        doc.protect(spp);

        doc.save(outputFilePath);
    }
}

Update password

/**
 * Opens a password-protected PDF and saves a copy protected with a new
 * password, with printing and content extraction disabled.
 *
 * @param inputFilePath  path of the protected PDF to read
 * @param outputFilePath path of the re-protected PDF to write
 * @param oldPassword    password used to open the input file
 * @param newPassword    used as both the new owner and the new user password
 * @throws IOException if the PDF cannot be loaded, protected or saved
 */
public static void updatePdfPassword(String inputFilePath, String outputFilePath,
                                     String oldPassword, String newPassword) throws IOException {
    // try-with-resources closes the document even when protect/save throws
    try (PDDocument doc = PDDocument.load(new File(inputFilePath), oldPassword)) {
        AccessPermission ap = new AccessPermission();
        // Disable printing
        ap.setCanPrint(false);
        // Disable copying
        ap.setCanExtractContent(false);
        // Disable other things if needed...

        // Constructor arguments: owner password (opens the file with all
        // permissions), then user password (opens it with restricted permissions).
        StandardProtectionPolicy spp = new StandardProtectionPolicy(newPassword, newPassword, ap);
        // Length of the encryption key. Possible values are 40, 128 or 256.
        int keyLength = 256;
        spp.setEncryptionKeyLength(keyLength);

        // Apply protection
        doc.protect(spp);

        doc.save(outputFilePath);
    }
}

Remove password

/**
 * Opens a password-protected PDF and saves an unprotected copy.
 *
 * @param inputFilePath  path of the protected PDF to read
 * @param outputFilePath path of the unprotected PDF to write
 * @param password       password used to open the input file
 * @throws IOException if the PDF cannot be loaded or saved
 */
public static void removePdfPassword(String inputFilePath, String outputFilePath,
                                     String password) throws IOException {
    // try-with-resources closes the document even when saving throws
    try (PDDocument doc = PDDocument.load(new File(inputFilePath), password)) {
        // Ask PDFBox to drop all security when the document is saved
        doc.setAllSecurityToBeRemoved(true);
        // Save the unprotected PDF document
        doc.save(outputFilePath);
    }
}

Convert to Image

PDF to Image

/**
 * Renders every page of a PDF at 300 DPI and writes each one as
 * {@code <imageFileDir>/<name>_<pageNumber>.png}, with page numbers
 * starting at 1.
 *
 * @param pdfFilePath  path of the PDF to render
 * @param imageFileDir directory that receives the PNG files
 * @throws IOException if the PDF cannot be loaded or an image cannot be written
 */
public static void pdfToImage(String pdfFilePath, String imageFileDir) throws IOException {
    File file = new File(pdfFilePath);
    // Strip only a trailing ".pdf" (any case). The previous unanchored pattern
    // also removed ".pdf" from the middle of a name and missed mixed case.
    String baseName = file.getName().replaceFirst("(?i)[.]pdf$", "");

    // try-with-resources closes the document even when rendering or writing fails
    try (PDDocument document = PDDocument.load(file)) {
        // Create PDFRenderer object to render each page as an image
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        int pageCount = document.getNumberOfPages();
        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
            // Render the page as an image
            // 100 DPI: general-quality
            // 300 DPI: high-quality
            // 600 DPI: pristine-quality
            BufferedImage image = pdfRenderer.renderImageWithDPI(pageIndex, 300);
            // File names use 1-based page numbers
            String imageFilePath = imageFileDir + File.separator
                    + baseName + "_" + (pageIndex + 1) + ".png";
            // ImageIO.write reports "no suitable writer" via its return value
            // rather than an exception, so surface that case explicitly
            if (!ImageIO.write(image, "PNG", new File(imageFilePath))) {
                throw new IOException("No PNG writer available for " + imageFilePath);
            }
        }
    }
}

Image to PDF

/**
 * Creates a one-page PDF containing the given image, drawn from the page
 * origin (0, 0) and scaled down to at most 600 units wide while keeping the
 * aspect ratio (integer arithmetic).
 *
 * @param imagePath path of the image file to embed
 * @param pdfPath   path of the PDF to write
 * @throws IOException if the image cannot be read or the PDF cannot be saved
 */
private static void imageToPdf(String imagePath, String pdfPath) throws IOException {
    try (PDDocument doc = new PDDocument()) {
        PDPage page = new PDPage();
        doc.addPage(page);
        // createFromFile is the simplest route for an image on disk; for an
        // in-memory BufferedImage use LosslessFactory.createFromImage() instead
        PDImageXObject pdImage = PDImageXObject.createFromFile(imagePath, doc);
        try (PDPageContentStream contents = new PDPageContentStream(doc, page)) {
            int imageWidth = pdImage.getWidth();
            // Never upscale: cap the drawn width at 600, or the image's own width if smaller
            int targetWidth = Math.min(imageWidth, 600);
            // Height follows the same ratio as the width
            int targetHeight = pdImage.getHeight() * targetWidth / imageWidth;
            contents.drawImage(pdImage, 0, 0, targetWidth, targetHeight);
        }
        doc.save(pdfPath);
    }
}

Create PDFs

String outputFilePath = "output/pdf/filepath";

// try-with-resources closes the document even when drawing or saving throws
try (PDDocument document = new PDDocument()) {
    PDPage page = new PDPage(PDRectangle.A4);
    document.addPage(page);
    // Create the content stream used to draw on the page; it is closed at the
    // end of this inner block, before the document is saved
    try (PDPageContentStream contentStream = new PDPageContentStream(document, page)) {
        contentStream.setFont(PDType1Font.HELVETICA, 12);
        // Insert text
        contentStream.beginText();
        contentStream.newLineAtOffset(100, 700);
        contentStream.showText("Hello, World!");
        contentStream.endText();
        // Load the image
        String imageFilePath = "C:\\Users\\Taogen\\Pictures\\icon.jpg";
        PDImageXObject image = PDImageXObject.createFromFile(imageFilePath, document);
        // Set the scale and position of the image on the page
        float scale = 0.5f; // adjust the scale as needed
        float x = 100; // x-coordinate of the image
        float y = 500; // y-coordinate of the image
        // Draw the image on the page
        contentStream.drawImage(image, x, y, image.getWidth() * scale, image.getHeight() * scale);
    }
    document.save(outputFilePath);
}

Compress (TODO)

Watermark (TODO)