commit 1ff12076195ded417fbb94f73e6ede39bd647c41 Author: juyung Date: Sun Oct 26 19:15:38 2014 -0700 Archive diff --git a/build.xml b/build.xml new file mode 100755 index 0000000..5a5b856 --- /dev/null +++ b/build.xml @@ -0,0 +1,73 @@ + + + + + + + + + + + Builds, tests, and runs the project WordAnalyzer. + + + diff --git a/manifest.mf b/manifest.mf new file mode 100755 index 0000000..1574df4 --- /dev/null +++ b/manifest.mf @@ -0,0 +1,3 @@ +Manifest-Version: 1.0 +X-COMMENT: Main-Class will be added automatically by build + diff --git a/nbproject/build-impl.xml b/nbproject/build-impl.xml new file mode 100755 index 0000000..e27bb05 --- /dev/null +++ b/nbproject/build-impl.xml @@ -0,0 +1,1413 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must set src.dir + Must set test.src.dir + Must set build.dir + Must set dist.dir + Must set build.classes.dir + Must set dist.javadoc.dir + Must set build.test.classes.dir + Must set build.test.results.dir + Must set build.classes.excludes + Must set dist.jar + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must set javac.includes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + No tests executed. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must set JVM to use for profiling in profiler.info.jvm + Must set profiler agent JVM arguments in profiler.info.jvmargs.agent + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must select some files in the IDE or set javac.includes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + To run this application from the command line without Ant, try: + + java -jar "${dist.jar.resolved}" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must select one file in the IDE or set run.class + + + + Must select one file in the IDE or set run.class + + + + + + + + + + + + + + + + + + + + + + + Must select one file in the IDE or set debug.class + + + + + Must select one file in the IDE or set debug.class + + + + + Must set fix.includes + + + + + + + + + + This target only works when run from inside the NetBeans IDE. + + + + + + + + + Must select one file in the IDE or set profile.class + This target only works when run from inside the NetBeans IDE. + + + + + + + + + This target only works when run from inside the NetBeans IDE. 
+ + + + + + + + + + + + + This target only works when run from inside the NetBeans IDE. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must select one file in the IDE or set run.class + + + + + + Must select some files in the IDE or set test.includes + + + + + Must select one file in the IDE or set run.class + + + + + Must select one file in the IDE or set applet.url + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must select some files in the IDE or set javac.includes + + + + + + + + + + + + + + + + + + + + Some tests failed; see details above. + + + + + + + + + Must select some files in the IDE or set test.includes + + + + Some tests failed; see details above. + + + + Must select some files in the IDE or set test.class + Must select some method in the IDE or set test.method + + + + Some tests failed; see details above. + + + + + Must select one file in the IDE or set test.class + + + + Must select one file in the IDE or set test.class + Must select some method in the IDE or set test.method + + + + + + + + + + + + + + Must select one file in the IDE or set applet.url + + + + + + + + + Must select one file in the IDE or set applet.url + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/nbproject/genfiles.properties b/nbproject/genfiles.properties new file mode 100755 index 0000000..c58039b --- /dev/null +++ b/nbproject/genfiles.properties @@ -0,0 +1,8 @@ +build.xml.data.CRC32=49261115 +build.xml.script.CRC32=7bcfe82a +build.xml.stylesheet.CRC32=8064a381@1.75.1.48 +# This file is used by a NetBeans-based IDE to track changes in generated files such as build-impl.xml. +# Do not edit this file. You may delete it but then the IDE will never regenerate such files for you. 
+nbproject/build-impl.xml.data.CRC32=49261115 +nbproject/build-impl.xml.script.CRC32=9b562341 +nbproject/build-impl.xml.stylesheet.CRC32=876e7a8f@1.75.1.48 diff --git a/nbproject/project.properties b/nbproject/project.properties new file mode 100755 index 0000000..8a942c4 --- /dev/null +++ b/nbproject/project.properties @@ -0,0 +1,91 @@ +annotation.processing.enabled=true +annotation.processing.enabled.in.editor=false +annotation.processing.processors.list= +annotation.processing.run.all.processors=true +annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output +application.title=WordAnalyzer +application.vendor=i +build.classes.dir=${build.dir}/classes +build.classes.excludes=**/*.java,**/*.form +# This directory is removed when the project is cleaned: +build.dir=build +build.generated.dir=${build.dir}/generated +build.generated.sources.dir=${build.dir}/generated-sources +# Only compile against the classpath explicitly listed here: +build.sysclasspath=ignore +build.test.classes.dir=${build.dir}/test/classes +build.test.results.dir=${build.dir}/test/results +# Uncomment to specify the preferred debugger connection transport: +#debug.transport=dt_socket +debug.classpath=\ + ${run.classpath} +debug.test.classpath=\ + ${run.test.classpath} +# Files in build.classes.dir which should be excluded from distribution jar +dist.archive.excludes= +# This directory is removed when the project is cleaned: +dist.dir=dist +dist.jar=${dist.dir}/WordAnalyzer.jar +dist.javadoc.dir=${dist.dir}/javadoc +endorsed.classpath= +excludes= +file.reference.pdfbox-app-1.8.6.jar=C:\\Users\\i\\Documents\\NetBeansProjects\\pdfbox-app-1.8.6.jar +includes=** +jar.archive.disabled=${jnlp.enabled} +jar.compress=false +jar.index=${jnlp.enabled} +javac.classpath=\ + ${file.reference.pdfbox-app-1.8.6.jar} +# Space-separated list of extra javac options +javac.compilerargs= +javac.deprecation=false +javac.processorpath= +javac.source=1.8 +javac.target=1.8 +javac.test.classpath=\ 
+ ${javac.classpath}:\ + ${build.classes.dir} +javac.test.processorpath=\ + ${javac.test.classpath} +javadoc.additionalparam= +javadoc.author=false +javadoc.encoding=${source.encoding} +javadoc.noindex=false +javadoc.nonavbar=false +javadoc.notree=false +javadoc.private=false +javadoc.splitindex=true +javadoc.use=true +javadoc.version=false +javadoc.windowtitle= +jnlp.codebase.type=no.codebase +jnlp.descriptor=application +jnlp.enabled=false +jnlp.mixed.code=default +jnlp.offline-allowed=false +jnlp.signed=false +jnlp.signing= +jnlp.signing.alias= +jnlp.signing.keystore= +main.class=wordanalyzer.WordAnalyzer +# Optional override of default Codebase manifest attribute, use to prevent RIAs from being repurposed +manifest.custom.codebase= +# Optional override of default Permissions manifest attribute (supported values: sandbox, all-permissions) +manifest.custom.permissions= +manifest.file=manifest.mf +meta.inf.dir=${src.dir}/META-INF +mkdist.disabled=false +platform.active=default_platform +run.classpath=\ + ${javac.classpath}:\ + ${build.classes.dir} +# Space-separated list of JVM arguments used when running the project. +# You may also define separate properties like run-sys-prop.name=value instead of -Dname=value. 
+# To set system properties for unit tests define test-sys-prop.name=value: +run.jvmargs= +run.test.classpath=\ + ${javac.test.classpath}:\ + ${build.test.classes.dir} +source.encoding=UTF-8 +src.dir=src +test.src.dir=test diff --git a/nbproject/project.xml b/nbproject/project.xml new file mode 100755 index 0000000..08dce56 --- /dev/null +++ b/nbproject/project.xml @@ -0,0 +1,15 @@ + + + org.netbeans.modules.java.j2seproject + + + WordAnalyzer + + + + + + + + + diff --git a/pdfbox-1.8.16.jar b/pdfbox-1.8.16.jar new file mode 100755 index 0000000..d1e4c85 Binary files /dev/null and b/pdfbox-1.8.16.jar differ diff --git a/src/wordanalyzer/WordAnalyzer.java b/src/wordanalyzer/WordAnalyzer.java new file mode 100755 index 0000000..56ab50a --- /dev/null +++ b/src/wordanalyzer/WordAnalyzer.java @@ -0,0 +1,340 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
*/
package wordanalyzer;
import java.util.*;
import java.io.*;
import java.awt.*;
import javax.swing.*;
import java.awt.event.*;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.util.*;

/**
 * Small Swing tool that extracts the text of every PDF in a folder and reports
 * how often each word occurs.
 *
 * <p>All state is static: the GUI callbacks copy the user's settings into the
 * public fields below and {@link #scanner()} reads them. The scan itself runs on
 * a background thread so the window stays responsive; every Swing update made by
 * the scan is forwarded to the event dispatch thread.
 *
 * @author i
 */
public class WordAnalyzer {
    /** Upper bound (inclusive) of the occurrence count a word must have to be reported. */
    public static int MAX_DUPLICATION_COUNT = 0;
    /** Lower bound (inclusive) of the occurrence count a word must have to be reported. */
    public static int MIN_DUPLICATION_COUNT = 0;
    /** Maximum number of folder entries to process; a value {@code <= 0} means no limit. */
    public static int MAX_FILE_COUNT = 0;
    public static final String DIVIDING_LINE_SINGLE = "______________________________________________________________________________________\n";
    public static final String DIVIDING_LINE_DOUBLE = "======================================================================================\n";
    public static final String FOLDERPATH_HDR = "[FOLDER PATH] ";
    public static final String ERR_FILENOTFOUND_MSG = "[ERROR] PDF FILE NOT FOUND";
    public static final String ERR_FILEENCRYPTED_MSG = "[ERROR] PDF FILE IS ENCRYPTED";
    public static final String ERR_FOLDERPATHNOTEXISTS_MSG = "[ERROR] FOLDER PATH NOT EXISTS";
    public static final String INTRODUCTION = "WELCOME TO WORD ANALYZER!\nAUTHOR: JUYOUNG LEE LAST UPDATED: 10-30-2014 VERSION: 1.0\n";

    /** Word -> number of occurrences, sorted by word. Reset at the start of every scan. */
    public static Map<String, Integer> map = new TreeMap<>();

    /** Number of folder entries visited by the most recent scan. */
    public static int fileCount = 0;
    /** Folder to scan; set by the GUI before {@link #scanner()} is started. */
    public static String folderPath;
    public static long startTime = 0, finishTime = 0;

    public static JTextArea textArea = new JTextArea(14, 55);
    public static JTextArea textAreaResultWindow = new JTextArea(8, 55);
    public static JScrollPane scrollPane = new JScrollPane(textArea);
    public static JScrollPane scrollPaneResultWindow = new JScrollPane(textAreaResultWindow);
    public static JButton btnConfirmFolderPath = new JButton(" Confirm Folder Path ");

    /** File-count limit used when the optional "max file count" box is empty or invalid. */
    private static final int DEFAULT_MAX_FILE_COUNT = 1000000000;

    /** Everything except ASCII letters/digits, Hangul syllables, space and hyphen is a separator. */
    private static final String NON_WORD_CHARS = "[^a-zA-Z0-9\uAC00-\uD7A3 -]";

    /** Set by the Stop button and polled between files by {@link #scanner()}. */
    private static volatile boolean stopRequested = false;

    /**
     * Starts the application by building the window on the event dispatch thread.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String[] args) {
        SwingUtilities.invokeLater(WordAnalyzer::window);
    }

    /**
     * Builds and shows the main window.
     *
     * <p>The frame gets a fixed size of 50% x 70% of the screen. All components
     * are added before the frame is sized, centred and made visible, so the
     * first layout pass already sees the complete component tree.
     */
    public static void window() {
        Dimension screenSize = Toolkit.getDefaultToolkit().getScreenSize();
        Dimension frameSize = new Dimension(
                (int) (screenSize.getWidth() * 0.5),
                (int) (screenSize.getHeight() * 0.7));

        JFrame frame = new JFrame("Word Analyzer");
        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
        frame.setJMenuBar(createMenuBar());

        JTextField txtboxFolderPath = createTextField(34, "C:\\");
        JLabel lblMaxDupCount = new JLabel("MAX. DUP. COUNT: ");
        JTextField txtboxMaxDupCount = createTextField(5, "0");
        JLabel lblMinDupCount = new JLabel("MIN. DUP. COUNT: ");
        JTextField txtboxMinDupCount = createTextField(5, "0");
        JLabel lblMaxFileCount = new JLabel("MAX. FILE COUNT (OPTIONAL): ");
        JTextField txtboxMaxFileCount = createTextField(5, null);

        configureLogArea(textArea, scrollPane);
        textArea.setText(DIVIDING_LINE_SINGLE + INTRODUCTION + DIVIDING_LINE_DOUBLE);
        configureLogArea(textAreaResultWindow, scrollPaneResultWindow);

        JButton btnStopScan = new JButton(" Stop Scan ");
        btnStopScan.setEnabled(false);
        btnStopScan.addActionListener(event -> {
            // The worker notices the flag before it opens the next file; the
            // confirm button is re-enabled once the worker has really finished.
            stopRequested = true;
            btnStopScan.setEnabled(false);
        });

        btnConfirmFolderPath.addActionListener(event -> {
            MIN_DUPLICATION_COUNT = parseCount(txtboxMinDupCount.getText(), 0);
            MAX_DUPLICATION_COUNT = parseCount(txtboxMaxDupCount.getText(), 0);
            MAX_FILE_COUNT = parseCount(txtboxMaxFileCount.getText(), DEFAULT_MAX_FILE_COUNT);
            folderPath = txtboxFolderPath.getText();
            startScan(btnStopScan);
        });

        JPanel panel = new JPanel();
        panel.add(txtboxFolderPath);
        panel.add(btnConfirmFolderPath);
        panel.add(btnStopScan);
        panel.add(lblMinDupCount);
        panel.add(txtboxMinDupCount);
        panel.add(lblMaxDupCount);
        panel.add(txtboxMaxDupCount);
        panel.add(lblMaxFileCount);
        panel.add(txtboxMaxFileCount);
        // The text areas are already inside their scroll panes; only the panes are added.
        panel.add(scrollPane);
        panel.add(scrollPaneResultWindow);
        frame.add(panel);

        frame.setMinimumSize(frameSize);
        frame.setSize(frameSize);
        frame.setResizable(false);
        // Centre only after the size is known; centring a 0x0 frame puts its corner mid-screen.
        frame.setLocationRelativeTo(null);
        frame.setVisible(true);
    }

    /**
     * Scans {@link #folderPath} and counts the words of every PDF found there.
     *
     * <p>Reads {@link #MAX_FILE_COUNT}, {@link #MIN_DUPLICATION_COUNT} and
     * {@link #MAX_DUPLICATION_COUNT}; resets and then updates {@link #fileCount}
     * and {@link #map}. Progress lines are written to {@link #textArea}, and the
     * words whose count lies within the configured range are appended to
     * {@link #textAreaResultWindow} once all files have been processed (or the
     * scan was stopped).
     *
     * <p>The method blocks until the scan is finished. It may be called from any
     * thread: when called off the event dispatch thread its UI updates are
     * queued with {@code invokeLater}.
     */
    public static void scanner() {
        fileCount = 0; // per-scan counter
        map.clear();   // counts must not leak from a previous scan
        prependLog(FOLDERPATH_HDR + folderPath);

        // listFiles() yields null when the path is missing or is not a directory.
        File[] entries = (folderPath == null) ? null : new File(folderPath).listFiles();
        if (entries == null) {
            prependLog(ERR_FOLDERPATHNOTEXISTS_MSG);
            return;
        }

        int limit = (MAX_FILE_COUNT > 0) ? Math.min(MAX_FILE_COUNT, entries.length) : entries.length;
        for (int i = 0; i < limit && !stopRequested; i++) {
            File entry = entries[i];
            String path = entry.getAbsolutePath();
            fileCount++;
            prependLog("[" + fileCount + "/" + limit + "] SCANNED FILE PATH: " + path);

            if (!entry.isFile() || !path.toLowerCase(Locale.ROOT).endsWith(".pdf")) {
                prependLog(ERR_FILENOTFOUND_MSG);
                continue;
            }
            try {
                countWordsInPdf(entry);
            } catch (IOException | RuntimeException ex) {
                // One unreadable or corrupt file must not abort the whole folder scan.
                prependLog("[ERROR] CANNOT READ " + path + ": " + ex);
            }
        }
        publishResults();
    }

    /**
     * Formats a duration as days, hours, minutes and seconds, omitting leading
     * units that are zero, e.g. {@code "2 MINUTES 5 SECONDS"}. Seconds are
     * always shown; a unit is singular only when its value is exactly 1.
     *
     * @param duration the duration in milliseconds
     * @return the formatted duration in upper-case English
     */
    public static String millisToShortDHMS(long duration) {
        long days = TimeUnit.MILLISECONDS.toDays(duration);
        long hours = TimeUnit.MILLISECONDS.toHours(duration) % 24;
        long minutes = TimeUnit.MILLISECONDS.toMinutes(duration) % 60;
        long seconds = TimeUnit.MILLISECONDS.toSeconds(duration) % 60;

        StringBuilder res = new StringBuilder();
        boolean started = false;
        if (days != 0) {
            res.append(quantity(days, "DAY")).append(' ');
            started = true;
        }
        if (started || hours != 0) {
            res.append(quantity(hours, "HOUR")).append(' ');
            started = true;
        }
        if (started || minutes != 0) {
            res.append(quantity(minutes, "MINUTE")).append(' ');
        }
        res.append(quantity(seconds, "SECOND"));
        return res.toString();
    }

    /** Renders {@code value} with its unit, pluralised unless the value is exactly 1. */
    private static String quantity(long value, String singularUnit) {
        return value + " " + singularUnit + (value == 1 ? "" : "S");
    }

    /** Creates the (currently action-less) menu bar with its File entries. */
    private static JMenuBar createMenuBar() {
        JMenuBar menuBar = new JMenuBar();
        JMenu fileMenu = new JMenu("File");
        menuBar.add(fileMenu);
        menuBar.add(new JMenu("Edit"));
        menuBar.add(new JMenu("View"));
        menuBar.add(new JMenu("Tools"));
        menuBar.add(new JMenu("Window"));
        menuBar.add(new JMenu("Help"));
        fileMenu.add(new JMenuItem("New"));
        fileMenu.add(new JMenuItem("Open"));
        fileMenu.add(new JMenuItem("Save"));
        fileMenu.add(new JMenuItem("Exit"));
        return menuBar;
    }

    /** Creates a text field with the standard 2px margin; {@code initialText} may be null. */
    private static JTextField createTextField(int columns, String initialText) {
        JTextField field = new JTextField(columns);
        field.setMargin(new Insets(2, 2, 2, 2));
        if (initialText != null) {
            field.setText(initialText);
        }
        return field;
    }

    /** Applies the shared read-only, word-wrapping look to a log area and its scroll pane. */
    private static void configureLogArea(JTextArea area, JScrollPane pane) {
        area.setEditable(false);
        area.setLineWrap(true);
        area.setWrapStyleWord(true);
        area.setMargin(new Insets(10, 10, 10, 10));
        pane.setVerticalScrollBarPolicy(ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);
    }

    /** Parses a count typed by the user, returning {@code fallback} when it is not an integer. */
    private static int parseCount(String text, int fallback) {
        try {
            return Integer.parseInt(text.trim());
        } catch (NumberFormatException ex) {
            return fallback;
        }
    }

    /**
     * Runs {@link #scanner()} on a background thread and writes the summary
     * when it ends. Must be called on the event dispatch thread.
     */
    private static void startScan(final JButton stopButton) {
        stopRequested = false;
        startTime = System.currentTimeMillis();
        btnConfirmFolderPath.setEnabled(false);
        stopButton.setEnabled(true);
        textAreaResultWindow.append(DIVIDING_LINE_SINGLE + "[RESULT] DUP WORD (DUP TIMES)\n");

        new SwingWorker<Void, Void>() {
            @Override
            protected Void doInBackground() {
                scanner();
                return null;
            }

            @Override
            protected void done() {
                try {
                    get(); // surfaces anything scanner() threw
                } catch (InterruptedException ex) {
                    Thread.currentThread().interrupt();
                } catch (ExecutionException ex) {
                    textArea.insert("[ERROR] SCAN FAILED: " + ex.getCause() + "\n", 0);
                }
                finishScan(stopButton);
            }
        }.execute();
    }

    /** Writes the end-of-scan summary and restores the buttons; runs on the event dispatch thread. */
    private static void finishScan(JButton stopButton) {
        finishTime = System.currentTimeMillis();
        String exeTime = millisToShortDHMS(finishTime - startTime);
        String status = stopRequested ? "[SCAN STOPPED] " : "[SCAN COMPLETED] ";
        textArea.insert(status + folderPath + "\n", 0);
        textArea.append("[LOG] SCANNED FOLDER: " + folderPath + "\n[LOG] NUMBER OF FILES SCANNED: " + fileCount + "\n"
                + "[LOG] TOTAL TIME: " + exeTime + "\n" + DIVIDING_LINE_SINGLE);
        textArea.insert(DIVIDING_LINE_SINGLE, 0);
        textArea.setCaretPosition(textArea.getDocument().getLength());
        stopButton.setEnabled(false);
        btnConfirmFolderPath.setEnabled(true);
    }

    /**
     * Extracts the text of one PDF and adds its words to {@link #map}.
     * The document is closed even when extraction fails.
     */
    private static void countWordsInPdf(File pdfFile) throws IOException {
        PDDocument pdfDoc = PDDocument.load(pdfFile);
        try {
            if (pdfDoc.isEncrypted()) {
                // Reported, but extraction is still attempted as before.
                // NOTE(review): presumably PDFBox can read some encrypted files
                // (e.g. empty user password) - confirm before skipping them here.
                prependLog(ERR_FILEENCRYPTED_MSG);
            }
            tallyWords(new PDFTextStripper().getText(pdfDoc), map);
        } finally {
            pdfDoc.close();
        }
    }

    /**
     * Splits {@code text} into words and increments each word's entry in
     * {@code counts}. Text that contains no word characters adds nothing.
     */
    private static void tallyWords(String text, Map<String, Integer> counts) {
        String cleaned = text.replaceAll(NON_WORD_CHARS, " ").trim();
        if (cleaned.isEmpty()) {
            return; // "".split(...) would otherwise yield one empty "word"
        }
        for (String word : cleaned.split("\\s+")) {
            counts.merge(word, 1, Integer::sum);
        }
    }

    /** Appends every word whose count lies within [MIN, MAX] to the result window. */
    private static void publishResults() {
        StringBuilder report = new StringBuilder();
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            int count = entry.getValue();
            if (count >= MIN_DUPLICATION_COUNT && count <= MAX_DUPLICATION_COUNT) {
                report.append(entry.getKey()).append(" (").append(count).append(")\n");
            }
        }
        final String text = report.toString();
        if (!text.isEmpty()) {
            onUi(() -> textAreaResultWindow.append(text));
        }
    }

    /** Inserts one line at the top of the log area. */
    private static void prependLog(final String line) {
        onUi(() -> textArea.insert(line + "\n", 0));
    }

    /** Runs a Swing update immediately when on the event dispatch thread, otherwise queues it. */
    private static void onUi(Runnable update) {
        if (SwingUtilities.isEventDispatchThread()) {
            update.run();
        } else {
            SwingUtilities.invokeLater(update);
        }
    }
}