Tuesday, March 26, 2013

Search PDF files for a given string: Java

 Third-party JARs used:
 itextpdf-5.3.3.jar

package com.pramod;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;

/**
 * @author PramodKumarGampa
 *
 *         This program reads list of PDF files available in the directory
 *         specified through "absolutePathToDir" and shows the occurences of
 *         search string specified through "searchString"
 *
 */
public class FindStringInPDF {

    public static void main(String[] args) throws IOException {

        String absolutePathToDir = "C:\\PDFS\\";
        String searchString = "ABC";
        showSearchResults(absolutePathToDir, searchString);
        System.out.println("End of the program");

    }

    public static void showSearchResults(String absolutePathToDir,
            String searchString) throws IOException {
        PdfReader reader;
        File file = new File(absolutePathToDir);

        String[] listOfFiles = file.list(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith("pdf");
            }

        });

        for (String fileName : listOfFiles) {
            reader = new PdfReader(absolutePathToDir + fileName);
            int noOfPages = reader.getNumberOfPages();
            String pageText = null;
            for (int i = 1; i < noOfPages; i++) {
                pageText = PdfTextExtractor.getTextFromPage(reader, i);
                if (pageText.contains(searchString)) {
                    System.out.println("Search string '" + searchString
                            + "' found in file: " + fileName);
                    break;
                }

            }

        }

    }

}