/*
 * Decompiled with CFR 0.152.
 */
package org.jpedal.examples.text;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.Iterator;
import java.util.Vector;
import org.jpedal.PdfDecoder;
import org.jpedal.exception.PdfException;
import org.jpedal.exception.PdfSecurityException;
import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.Strip;

/**
 * Example program that extracts the words on each page of a PDF, together with
 * the coordinates reported for each word, and writes them to text files.
 *
 * <p>For every page on which words are found, a file
 * {@code <user.dir>/text/<name>/words-<page>.txt} is written in UTF-8 with one
 * line per word in the form {@code word,x1,y1,x2,y2}. {@code <name>} is the
 * file name without its 4-character extension when the supplied path contains
 * the platform file separator, otherwise {@code demo}.
 */
public class ExtractTextAsWordlist {
    /** When true, progress messages are printed to {@code System.out}. */
    public static boolean outputMessages = true;
    /** Running total of words extracted by this instance. */
    private int wordsExtracted = 0;
    /** Base output directory; the constructors make sure it ends with the separator. */
    private String user_dir = System.getProperty("user.dir");
    /** Platform file separator used when building and splitting paths. */
    String separator = System.getProperty("file.separator");
    /** Decoder for the document currently being processed; null between documents. */
    PdfDecoder decodePdf = null;
    /** True when the input is a file on disk, false when it is {@link #byteArray}. */
    private boolean isFile = true;
    /** In-memory PDF data, used only when {@link #isFile} is false. */
    private byte[] byteArray = null;
    /** When true, at most the first 10 pages of each document are processed. */
    public static boolean isTest = false;

    /** Creates an instance without processing anything. */
    public ExtractTextAsWordlist() {
    }

    /**
     * Processes a single PDF file, or every file whose name ends in
     * {@code .pdf} (case-insensitive) directly inside a directory.
     *
     * @param file_name path of a PDF file or of a directory containing PDF files
     */
    public ExtractTextAsWordlist(String file_name) {
        if (outputMessages) {
            System.out.println("processing " + file_name);
        }
        if (!this.user_dir.endsWith(this.separator)) {
            this.user_dir = this.user_dir + this.separator;
        }
        if (file_name.toLowerCase().endsWith(".pdf")) {
            this.decodeFile(file_name);
            return;
        }

        // Anything that is not a .pdf is treated as a directory of PDFs.
        if (!file_name.endsWith(this.separator)) {
            file_name = file_name + this.separator;
        }
        String[] files = null;
        try {
            File inputFiles = new File(file_name);
            if (!inputFiles.isDirectory()) {
                System.err.println(file_name + " is not a directory. Exiting program");
                // Stop here: there is nothing to list, and continuing would
                // dereference a null file list.
                return;
            }
            files = inputFiles.list();
        }
        catch (Exception ee) {
            LogWriter.writeLog("Exception trying to access file " + ee.getMessage());
        }
        if (files == null) {
            // File.list() returns null on an I/O error, and the catch above
            // also leaves the list unset.
            System.err.println("Unable to list files in " + file_name);
            return;
        }
        for (int i = 0; i < files.length; i++) {
            if (!files[i].toLowerCase().endsWith(".pdf")) {
                continue;
            }
            String path = file_name + files[i];
            if (outputMessages) {
                System.out.println(path);
            }
            this.decodeFile(path);
        }
    }

    /**
     * Processes a PDF held in memory. Output is written under the
     * {@code demo} directory because there is no file name to derive one from.
     *
     * @param array the raw bytes of the PDF document
     */
    public ExtractTextAsWordlist(byte[] array) {
        if (outputMessages) {
            System.out.println("processing byte array");
        }
        if (!this.user_dir.endsWith(this.separator)) {
            this.user_dir = this.user_dir + this.separator;
        }
        this.byteArray = array;
        this.isFile = false;
        this.decodeFile("byteArray");
    }

    /**
     * Opens one document, extracts the word list of each page and writes it
     * to the per-page output files, then closes the document.
     *
     * @param file_name path of the PDF to open; used only for naming and
     *                  messages when the input is a byte array
     */
    private void decodeFile(String file_name) {
        PdfDecoder.useTextExtraction();

        // Derive the output sub-directory from the file name, dropping the
        // 4-character extension; fall back to "demo" when there is no
        // separator in the name.
        String name = "demo";
        int pointer = file_name.lastIndexOf(this.separator);
        if (pointer != -1) {
            name = file_name.substring(pointer + 1, file_name.length() - 4);
        }
        String outputDir = this.user_dir + "text" + this.separator + name + this.separator;

        try {
            this.decodePdf = new PdfDecoder(false);
            this.decodePdf.setExtractionMode(1);
            this.decodePdf.init(true);
            PdfGroupingAlgorithms.useUnrotatedCoords = false;
            if (outputMessages) {
                System.out.println("Opening file :" + file_name);
            }
            if (this.isFile) {
                this.decodePdf.openPdfFile(file_name);
            } else {
                this.decodePdf.openPdfArray(this.byteArray);
            }
        }
        catch (PdfSecurityException e) {
            System.err.println("Exception " + e + " in pdf code for wordlist" + file_name);
        }
        catch (PdfException e) {
            System.err.println("Exception " + e + " in pdf code for wordlist" + file_name);
        }
        catch (Exception e) {
            System.err.println("Exception " + e + " in pdf code for wordlist" + file_name);
            e.printStackTrace();
        }

        if (this.decodePdf == null) {
            // The decoder could not even be constructed (already reported
            // above); there is nothing to query or close.
            return;
        }

        if (!this.decodePdf.isExtractionAllowed()) {
            if (outputMessages) {
                System.out.println("Text extraction not allowed");
            }
        } else if (this.decodePdf.isEncrypted() && !this.decodePdf.isPasswordSupplied()) {
            if (outputMessages) {
                System.out.println("Encrypted settings");
                System.out.println("Please look at SimpleViewer for code sample to handle such files");
            }
        } else {
            int start = 1;
            int end = this.decodePdf.getPageCount();
            if (end > 10 && isTest) {
                end = 10;
            }
            try {
                for (int page = start; page <= end; page++) {
                    this.decodePdf.decodePage(page);
                    PdfGroupingAlgorithms currentGrouping = this.decodePdf.getGroupingObject();
                    PdfPageData currentPageData = this.decodePdf.getPdfPageData();

                    // Extraction rectangle is the whole media box:
                    // (x1,y1) is the top-left corner, (x2,y2) the bottom-right.
                    // The Y origin must come from getMediaBoxY and the top edge
                    // is origin + height, mirroring the X axis.
                    int x1 = currentPageData.getMediaBoxX(page);
                    int x2 = currentPageData.getMediaBoxWidth(page) + x1;
                    int y2 = currentPageData.getMediaBoxY(page);
                    int y1 = currentPageData.getMediaBoxHeight(page) + y2;
                    if (outputMessages) {
                        System.out.println("Page " + page + " Extracting text from rectangle (" + x1 + ',' + y1 + ' ' + x2 + ',' + y2 + ')');
                    }

                    Vector words = null;
                    try {
                        words = currentGrouping.extractTextAsWordlist(x1, y1, x2, y2, page, false, true, "&:=()!;.,\\/\"\"''");
                    }
                    catch (PdfException e) {
                        this.decodePdf.closePdfFile();
                        System.err.println("Exception= " + e + " in " + file_name);
                        // The document has just been closed, so the remaining
                        // pages cannot be decoded.
                        break;
                    }

                    if (words == null) {
                        if (outputMessages) {
                            System.out.println("No text found");
                        }
                    } else {
                        File output_path = new File(outputDir);
                        if (!output_path.exists()) {
                            output_path.mkdirs();
                        }

                        // The list holds 5 entries per word: text, x1, y1, x2, y2.
                        int wordCount = words.size() / 5;
                        this.wordsExtracted += wordCount;
                        String outputFile = outputDir + "words-" + page + ".txt";
                        if (outputMessages) {
                            System.out.println("Page contains " + wordCount + " words.");
                            System.out.println("Writing to " + outputFile);
                        }

                        OutputStreamWriter output_stream = new OutputStreamWriter((OutputStream)new FileOutputStream(outputFile), "UTF-8");
                        try {
                            Iterator wordIterator = words.iterator();
                            while (wordIterator.hasNext()) {
                                String currentWord = Strip.convertToText((String)wordIterator.next());
                                int wx1 = (int)Float.parseFloat((String)wordIterator.next());
                                int wy1 = (int)Float.parseFloat((String)wordIterator.next());
                                int wx2 = (int)Float.parseFloat((String)wordIterator.next());
                                int wy2 = (int)Float.parseFloat((String)wordIterator.next());
                                output_stream.write(currentWord + ',' + wx1 + ',' + wy1 + ',' + wx2 + ',' + wy2 + '\n');
                            }
                        }
                        finally {
                            // Always release the file handle, even when a
                            // write or a coordinate parse fails part-way.
                            output_stream.close();
                        }
                    }
                    this.decodePdf.flushObjectValues(false);
                }
            }
            catch (Exception e) {
                this.decodePdf.closePdfFile();
                System.err.println("Exception " + e + " in " + file_name);
                e.printStackTrace();
            }
            this.decodePdf.flushObjectValues(true);
            if (outputMessages) {
                System.out.println("Text read");
            }
        }
        this.decodePdf.closePdfFile();
        this.decodePdf = null;
    }

    /**
     * Command-line entry point.
     *
     * @param args exactly one argument: the path of a PDF file or of a
     *             directory containing PDF files
     */
    public static void main(String[] args) {
        if (outputMessages) {
            System.out.println("Simple demo to extract text objects");
        }
        if (args.length != 1) {
            System.out.println("You must pass ONE parameter - a filename or directory in as a parameter");
            System.out.println("Make sure you put double quotes around the value if it has spaces");
            System.exit(1);
            return;
        }
        String file_name = args[0];
        if (outputMessages) {
            System.out.println("File :" + file_name);
        }
        if (!new File(file_name).exists()) {
            System.out.println("File " + file_name + " not found");
            // Nothing to process; fail the same way as a bad argument list.
            System.exit(1);
            return;
        }
        new ExtractTextAsWordlist(file_name);
    }

    /**
     * Returns the number of words extracted so far by this instance.
     *
     * @return total word count across all pages and files processed
     */
    public int getWordsExtractedCount() {
        return this.wordsExtracted;
    }
}

