This commit is contained in:
Jay 2014-10-26 19:15:38 -07:00
commit 1ff1207619
8 changed files with 1943 additions and 0 deletions

73
build.xml Executable file
View file

@ -0,0 +1,73 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- You may freely edit this file. See commented blocks below for -->
<!-- some examples of how to customize the build. -->
<!-- (If you delete it and reopen the project it will be recreated.) -->
<!-- By default, only the Clean and Build commands use this build script. -->
<!-- Commands such as Run, Debug, and Test only use this build script if -->
<!-- the Compile on Save feature is turned off for the project. -->
<!-- You can turn off the Compile on Save (or Deploy on Save) setting -->
<!-- in the project's Project Properties dialog box.-->
<project name="WordAnalyzer" default="default" basedir=".">
<description>Builds, tests, and runs the project WordAnalyzer.</description>
<import file="nbproject/build-impl.xml"/>
<!--
There exist several targets which are by default empty and which can be
used for execution of your tasks. These targets are usually executed
before and after some main targets. They are:
-pre-init: called before initialization of project properties
-post-init: called after initialization of project properties
-pre-compile: called before javac compilation
-post-compile: called after javac compilation
-pre-compile-single: called before javac compilation of single file
-post-compile-single: called after javac compilation of single file
-pre-compile-test: called before javac compilation of JUnit tests
-post-compile-test: called after javac compilation of JUnit tests
-pre-compile-test-single: called before javac compilation of single JUnit test
-post-compile-test-single: called after javac compilation of single JUnit test
-pre-jar: called before JAR building
-post-jar: called after JAR building
-post-clean: called after cleaning build products
(Targets beginning with '-' are not intended to be called on their own.)
Example of inserting an obfuscator after compilation could look like this:
<target name="-post-compile">
<obfuscate>
<fileset dir="${build.classes.dir}"/>
</obfuscate>
</target>
For a list of available properties, check the imported
nbproject/build-impl.xml file.
Another way to customize the build is by overriding existing main targets.
The targets of interest are:
-init-macrodef-javac: defines macro for javac compilation
-init-macrodef-junit: defines macro for junit execution
-init-macrodef-debug: defines macro for class debugging
-init-macrodef-java: defines macro for class execution
-do-jar: JAR building
run: execution of project
-javadoc-build: Javadoc generation
test-report: JUnit report generation
An example of overriding the target for project execution could look like this:
<target name="run" depends="WordAnalyzer-impl.jar">
<exec dir="bin" executable="launcher.exe">
<arg file="${dist.jar}"/>
</exec>
</target>
Notice that the overridden target depends on the jar target and not only on
the compile target as the regular run target does. Again, for a list of available
properties which you can use, check the target you are overriding in the
nbproject/build-impl.xml file.
-->
</project>

3
manifest.mf Executable file
View file

@ -0,0 +1,3 @@
Manifest-Version: 1.0
X-COMMENT: Main-Class will be added automatically by build

1413
nbproject/build-impl.xml Executable file

File diff suppressed because it is too large Load diff

8
nbproject/genfiles.properties Executable file
View file

@ -0,0 +1,8 @@
build.xml.data.CRC32=49261115
build.xml.script.CRC32=7bcfe82a
build.xml.stylesheet.CRC32=8064a381@1.75.1.48
# This file is used by a NetBeans-based IDE to track changes in generated files such as build-impl.xml.
# Do not edit this file. You may delete it but then the IDE will never regenerate such files for you.
nbproject/build-impl.xml.data.CRC32=49261115
nbproject/build-impl.xml.script.CRC32=9b562341
nbproject/build-impl.xml.stylesheet.CRC32=876e7a8f@1.75.1.48

91
nbproject/project.properties Executable file
View file

@ -0,0 +1,91 @@
annotation.processing.enabled=true
annotation.processing.enabled.in.editor=false
annotation.processing.processors.list=
annotation.processing.run.all.processors=true
annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output
application.title=WordAnalyzer
application.vendor=i
build.classes.dir=${build.dir}/classes
build.classes.excludes=**/*.java,**/*.form
# This directory is removed when the project is cleaned:
build.dir=build
build.generated.dir=${build.dir}/generated
build.generated.sources.dir=${build.dir}/generated-sources
# Only compile against the classpath explicitly listed here:
build.sysclasspath=ignore
build.test.classes.dir=${build.dir}/test/classes
build.test.results.dir=${build.dir}/test/results
# Uncomment to specify the preferred debugger connection transport:
#debug.transport=dt_socket
debug.classpath=\
${run.classpath}
debug.test.classpath=\
${run.test.classpath}
# Files in build.classes.dir which should be excluded from distribution jar
dist.archive.excludes=
# This directory is removed when the project is cleaned:
dist.dir=dist
dist.jar=${dist.dir}/WordAnalyzer.jar
dist.javadoc.dir=${dist.dir}/javadoc
endorsed.classpath=
excludes=
file.reference.pdfbox-app-1.8.6.jar=C:\\Users\\i\\Documents\\NetBeansProjects\\pdfbox-app-1.8.6.jar
includes=**
jar.archive.disabled=${jnlp.enabled}
jar.compress=false
jar.index=${jnlp.enabled}
javac.classpath=\
${file.reference.pdfbox-app-1.8.6.jar}
# Space-separated list of extra javac options
javac.compilerargs=
javac.deprecation=false
javac.processorpath=
javac.source=1.8
javac.target=1.8
javac.test.classpath=\
${javac.classpath}:\
${build.classes.dir}
javac.test.processorpath=\
${javac.test.classpath}
javadoc.additionalparam=
javadoc.author=false
javadoc.encoding=${source.encoding}
javadoc.noindex=false
javadoc.nonavbar=false
javadoc.notree=false
javadoc.private=false
javadoc.splitindex=true
javadoc.use=true
javadoc.version=false
javadoc.windowtitle=
jnlp.codebase.type=no.codebase
jnlp.descriptor=application
jnlp.enabled=false
jnlp.mixed.code=default
jnlp.offline-allowed=false
jnlp.signed=false
jnlp.signing=
jnlp.signing.alias=
jnlp.signing.keystore=
main.class=wordanalyzer.WordAnalyzer
# Optional override of default Codebase manifest attribute, use to prevent RIAs from being repurposed
manifest.custom.codebase=
# Optional override of default Permissions manifest attribute (supported values: sandbox, all-permissions)
manifest.custom.permissions=
manifest.file=manifest.mf
meta.inf.dir=${src.dir}/META-INF
mkdist.disabled=false
platform.active=default_platform
run.classpath=\
${javac.classpath}:\
${build.classes.dir}
# Space-separated list of JVM arguments used when running the project.
# You may also define separate properties like run-sys-prop.name=value instead of -Dname=value.
# To set system properties for unit tests define test-sys-prop.name=value:
run.jvmargs=
run.test.classpath=\
${javac.test.classpath}:\
${build.test.classes.dir}
source.encoding=UTF-8
src.dir=src
test.src.dir=test

15
nbproject/project.xml Executable file
View file

@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://www.netbeans.org/ns/project/1">
<type>org.netbeans.modules.java.j2seproject</type>
<configuration>
<data xmlns="http://www.netbeans.org/ns/j2se-project/3">
<name>WordAnalyzer</name>
<source-roots>
<root id="src.dir"/>
</source-roots>
<test-roots>
<root id="test.src.dir"/>
</test-roots>
</data>
</configuration>
</project>

BIN
pdfbox-1.8.16.jar Executable file

Binary file not shown.

View file

@ -0,0 +1,340 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package wordanalyzer;
import java.util.*;
import java.io.*;
import java.awt.*;
import javax.swing.*;
import java.awt.event.*;
import java.util.concurrent.TimeUnit;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.util.*;
/**
*
* @author i
*/
public class WordAnalyzer {
// Inclusive upper bound on a word's occurrence count for it to be listed in the results.
// Overwritten from the "MAX. DUP. COUNT" text box each time a scan is started.
public static int MAX_DUPLICATION_COUNT = 0;
// Inclusive lower bound on a word's occurrence count for it to be listed in the results.
// Overwritten from the "MIN. DUP. COUNT" text box each time a scan is started.
public static int MIN_DUPLICATION_COUNT = 0;
// Parsed from the "MAX. FILE COUNT (OPTIONAL)" text box (1000000000 when blank or invalid).
// NOTE(review): nothing in this class reads this value, so the limit is currently not enforced.
public static int MAX_FILE_COUNT = 0;
// Separator lines written into the log and result text areas.
public static final String DIVIDING_LINE_SINGLE = "______________________________________________________________________________________\n";
public static final String DIVIDING_LINE_DOUBLE = "======================================================================================\n";
// Prefix for the log line that echoes the folder being scanned.
public static final String FOLDERPATH_HDR = "[FOLDER PATH] ";
// Logged for a directory entry whose path does not end in ".pdf".
public static final String ERR_FILENOTFOUND_MSG = "[ERROR] PDF FILE NOT FOUND";
// Logged when PDDocument reports the loaded file as encrypted.
public static final String ERR_FILEENCRYPTED_MSG = "[ERROR] PDF FILE IS ENCRYPTED";
// Logged when the folder entered by the user does not exist.
public static final String ERR_FOLDERPATHNOTEXISTS_MSG = "[ERROR] FOLDER PATH NOT EXISTS";
// Banner shown in the log area when the window is first built.
public static final String INTRODUCTION = "WELCOME TO WORD ANALYZER!\nAUTHOR: JUYOUNG LEE LAST UPDATED: 10-30-2014 VERSION: 1.0\n";
// Word -> number of occurrences across the scanned PDFs; a TreeMap, so iteration is in key order.
public static Map<String, Integer> map = new TreeMap<>();
// Number of directory entries visited so far; scanner() resets it to 0 at the start of each scan.
public static int fileCount = 0;
// Folder to scan, copied from the folder-path text field when the confirm button is pressed.
public static String folderPath;
// System.currentTimeMillis() values taken just before and just after the scanner() call.
public static long startTime = 0, finishTime = 0;
//public static JLabel textLabel = new JLabel();
// Log area (14 rows x 55 columns): progress, errors and the end-of-scan summary.
public static JTextArea textArea = new JTextArea(14, 55);
// Result area (8 rows x 55 columns): one "word (count)" line per reported word.
public static JTextArea textAreaResultWindow = new JTextArea(8, 55);
// Scroll panes wrapping the two text areas; these, not the text areas, are added to the panel.
public static JScrollPane scrollPane = new JScrollPane(textArea);
public static JScrollPane scrollPaneResultWindow = new JScrollPane(textAreaResultWindow);
// Starts a scan of the entered folder; static because scanner() re-enables it as well.
public static JButton btnConfirmFolderPath = new JButton(" Confirm Folder Path ");
/**
 * Application entry point: schedules construction of the GUI on the Swing
 * event dispatch thread.
 *
 * @param args the command line arguments (not used)
 */
public static void main(String[] args) {
    // Swing components are not thread-safe; they must be created and shown
    // on the event dispatch thread rather than on the JVM's main thread.
    SwingUtilities.invokeLater(() -> window());
}
/**
 * Builds and shows the main "Word Analyzer" window.
 *
 * <p>The window contains a folder-path field, the confirm and stop buttons,
 * the three numeric option fields, the log area and the result area. Pressing
 * the confirm button copies the option fields into the static settings, runs
 * {@code scanner()} synchronously and then appends a summary (folder, number
 * of files, elapsed time) to the log area.
 *
 * <p>The frame is sized and populated first and only then centred and made
 * visible, so every component is laid out when the window first appears.
 */
public static void window() {
    Dimension screenSize = Toolkit.getDefaultToolkit().getScreenSize();
    Dimension frameSize = new Dimension(
            (int) (screenSize.getWidth() * 0.5),
            (int) (screenSize.getHeight() * 0.7));

    JFrame frame = new JFrame("Word Analyzer");
    frame.setJMenuBar(createMenuBar());
    frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
    // Fixed size of half the screen width by 70% of its height. The panel's
    // flow layout wraps the components inside this width, so the frame is
    // deliberately not packed to the components' preferred size.
    frame.setMinimumSize(frameSize);
    frame.setSize(frameSize);
    frame.setResizable(false);

    JTextField txtboxFolderPath = new JTextField(34);
    txtboxFolderPath.setMargin(new Insets(2, 2, 2, 2));
    txtboxFolderPath.setText("C:\\");

    JLabel lblMaxDupCount = new JLabel("MAX. DUP. COUNT: ");
    JTextField txtboxMaxDupCount = new JTextField(5);
    txtboxMaxDupCount.setMargin(new Insets(2, 2, 2, 2));
    txtboxMaxDupCount.setText("0");

    JLabel lblMinDupCount = new JLabel("MIN. DUP. COUNT: ");
    JTextField txtboxMinDupCount = new JTextField(5);
    txtboxMinDupCount.setMargin(new Insets(2, 2, 2, 2));
    txtboxMinDupCount.setText("0");

    JLabel lblMaxFileCount = new JLabel("MAX. FILE COUNT (OPTIONAL): ");
    JTextField txtboxMaxFileCount = new JTextField(5);
    txtboxMaxFileCount.setMargin(new Insets(2, 2, 2, 2));

    // Log area: read-only, word-wrapped, pre-filled with the banner.
    textArea.setEditable(false);
    textArea.setLineWrap(true);
    textArea.setWrapStyleWord(true);
    textArea.setText(DIVIDING_LINE_SINGLE + INTRODUCTION + DIVIDING_LINE_DOUBLE);
    textArea.setMargin(new Insets(10, 10, 10, 10));
    scrollPane.setVerticalScrollBarPolicy(ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);

    // Result area: read-only and word-wrapped.
    textAreaResultWindow.setEditable(false);
    textAreaResultWindow.setLineWrap(true);
    textAreaResultWindow.setWrapStyleWord(true);
    textAreaResultWindow.setMargin(new Insets(10, 10, 10, 10));
    scrollPaneResultWindow.setVerticalScrollBarPolicy(ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);

    // NOTE(review): the stop button starts disabled and nothing in this method
    // enables it; its handler only toggles the two buttons and does not
    // interrupt a running scan.
    JButton btnStopScan = new JButton(" Stop Scan ");
    btnStopScan.setEnabled(false);
    btnStopScan.addActionListener(e -> {
        btnStopScan.setEnabled(false);
        btnConfirmFolderPath.setEnabled(true);
    });

    btnConfirmFolderPath.addActionListener(e -> {
        MIN_DUPLICATION_COUNT = parseCountOrDefault(txtboxMinDupCount.getText(), 0);
        MAX_DUPLICATION_COUNT = parseCountOrDefault(txtboxMaxDupCount.getText(), 0);
        // A blank or invalid file limit means "effectively unlimited".
        MAX_FILE_COUNT = parseCountOrDefault(txtboxMaxFileCount.getText(), 1000000000);

        btnConfirmFolderPath.setEnabled(false);
        folderPath = txtboxFolderPath.getText();
        startTime = System.currentTimeMillis();
        textAreaResultWindow.append(DIVIDING_LINE_SINGLE + "[RESULT] DUP WORD (DUP TIMES)\n");
        try {
            scanner();
        } finally {
            // Re-enable the button even when the scan fails with an unchecked
            // exception; otherwise no further scan could ever be started.
            btnConfirmFolderPath.setEnabled(true);
        }
        finishTime = System.currentTimeMillis();

        String exeTime = millisToShortDHMS(finishTime - startTime);
        textArea.insert("[SCAN COMPLETED] " + folderPath + "\n", 0);
        textArea.append("[LOG] SCANNED FOLDER: " + folderPath + "\n[LOG] NUMBER OF FILES SCANNED: " + fileCount + "\n"
                + "[LOG] TOTAL TIME: " + exeTime + "\n" + DIVIDING_LINE_SINGLE);
        textArea.insert(DIVIDING_LINE_SINGLE, 0);
        // Paint immediately: this handler is still running on the event thread.
        textArea.update(textArea.getGraphics());
        textArea.setCaretPosition(textArea.getDocument().getLength());
    });

    JPanel panel = new JPanel();
    frame.add(panel);
    panel.add(txtboxFolderPath);
    panel.add(btnConfirmFolderPath);
    panel.add(btnStopScan);
    panel.add(lblMinDupCount);
    panel.add(txtboxMinDupCount);
    panel.add(lblMaxDupCount);
    panel.add(txtboxMaxDupCount);
    panel.add(lblMaxFileCount);
    panel.add(txtboxMaxFileCount);
    // Only the scroll panes are added; each already contains its text area.
    panel.add(scrollPane);
    panel.add(scrollPaneResultWindow);

    // Centre and show last, once the frame has its final size and content.
    frame.setLocationRelativeTo(null);
    frame.setVisible(true);
}

/** Creates the menu bar: a "File" menu with four items plus five empty menus. */
private static JMenuBar createMenuBar() {
    JMenuBar menuBar = new JMenuBar();
    JMenu fileMenu = new JMenu("File");
    menuBar.add(fileMenu);
    menuBar.add(new JMenu("Edit"));
    menuBar.add(new JMenu("View"));
    menuBar.add(new JMenu("Tools"));
    menuBar.add(new JMenu("Window"));
    menuBar.add(new JMenu("Help"));
    // The items have no listeners attached; they are placeholders.
    fileMenu.add(new JMenuItem("New"));
    fileMenu.add(new JMenuItem("Open"));
    fileMenu.add(new JMenuItem("Save"));
    fileMenu.add(new JMenuItem("Exit"));
    return menuBar;
}

/** Parses {@code text} (surrounding whitespace ignored) as an int; returns {@code fallback} if it is null or not a number. */
private static int parseCountOrDefault(String text, int fallback) {
    if (text == null) {
        return fallback;
    }
    try {
        return Integer.parseInt(text.trim());
    } catch (NumberFormatException notANumber) {
        return fallback;
    }
}
/**
 * Scans every entry directly inside {@code folderPath}, counts the words of
 * each PDF into {@code map}, and lists the words whose total count lies in
 * [{@code MIN_DUPLICATION_COUNT}, {@code MAX_DUPLICATION_COUNT}] in the result
 * area.
 *
 * <p>Progress and error lines are inserted at the top of the log area. Each
 * scan starts from a clean state: {@code fileCount} and {@code map} are both
 * reset, so counts from an earlier scan do not leak into this one. If the
 * folder cannot be listed, the "folder not exists" message is logged and the
 * method returns without scanning.
 *
 * <p>No files are created, modified or deleted on disk.
 *
 * <p>NOTE(review): {@code MAX_FILE_COUNT} is not consulted here, so the
 * optional file limit from the UI has no effect on the scan.
 */
public static void scanner() {
    File folder = new File(folderPath);
    fileCount = 0;
    map.clear();
    logToTop(FOLDERPATH_HDR + folderPath + "\n");

    // listFiles() returns null when the path does not exist or is not a
    // directory, so this has to be checked before iterating.
    File[] listOfFiles = folder.listFiles();
    if (listOfFiles == null) {
        logToTop(ERR_FOLDERPATHNOTEXISTS_MSG + "\n");
        btnConfirmFolderPath.setEnabled(true);
        return;
    }

    for (File entry : listOfFiles) {
        String fileInputPath = entry.getAbsolutePath();
        fileCount++;
        logToTop("[" + fileCount + "/" + listOfFiles.length + "] SCANNED FILE PATH: " + fileInputPath + "\n");

        // endsWith is safe for paths shorter than the extension, and the
        // lower-casing also accepts ".PDF" / ".Pdf".
        if (!fileInputPath.toLowerCase(Locale.ROOT).endsWith(".pdf")) {
            logToTop(ERR_FILENOTFOUND_MSG + "\n");
            continue;
        }
        countWordsInPdf(fileInputPath);
    }

    // Report once, after every file has contributed to the counts.
    for (Map.Entry<String, Integer> wordCount : map.entrySet()) {
        int occurrences = wordCount.getValue();
        if (occurrences >= MIN_DUPLICATION_COUNT && occurrences <= MAX_DUPLICATION_COUNT) {
            textAreaResultWindow.append(wordCount.getKey() + " (" + occurrences + ")\n");
            textAreaResultWindow.update(textAreaResultWindow.getGraphics());
        }
    }
}

/** Inserts {@code line} at the top of the log area and paints it immediately. */
private static void logToTop(String line) {
    textArea.insert(line, 0);
    textArea.update(textArea.getGraphics());
}

/** Extracts the text of one PDF and adds each of its words to {@code map}; failures are printed and the file is skipped. */
private static void countWordsInPdf(String pdfPath) {
    PDDocument pdfDoc = null;
    try {
        pdfDoc = PDDocument.load(new File(pdfPath));
        if (pdfDoc.isEncrypted()) {
            // Extraction is still attempted after logging, as before.
            // NOTE(review): confirm how the stripper treats encrypted files.
            logToTop(ERR_FILEENCRYPTED_MSG + "\n");
        }
        String text = new PDFTextStripper().getText(pdfDoc);
        // Keep ASCII letters, digits, Hangul syllables, spaces and hyphens;
        // every other character becomes a word separator.
        text = text.replaceAll("[^a-zA-Z0-9\uAC00-\uD7A3 -]", " ").trim();
        for (String word : text.split("\\s+")) {
            // Splitting an empty text yields one empty token; do not count it.
            if (!word.isEmpty()) {
                map.merge(word, 1, Integer::sum);
            }
        }
    } catch (Exception e) {
        // Boundary catch: one unreadable PDF must not abort the whole scan.
        e.printStackTrace();
    } finally {
        if (pdfDoc != null) {
            try {
                pdfDoc.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}
/**
 * Formats a duration given in milliseconds as upper-case text such as
 * {@code "5 SECONDS"}, {@code "1 MINUTE 30 SECONDS"} or
 * {@code "2 DAYS 3 HOURS 4 MINUTES 5 SECONDS"}.
 *
 * <p>The largest unit shown is the largest non-zero one; every smaller unit
 * down to seconds is always included. Fractions of a second are dropped. A
 * unit word is singular only when its value is exactly 1.
 *
 * @param duration the duration in milliseconds
 * @return the formatted duration, with units separated by single spaces
 */
public static String millisToShortDHMS(long duration) {
    long totalHours = TimeUnit.MILLISECONDS.toHours(duration);
    long totalMinutes = TimeUnit.MILLISECONDS.toMinutes(duration);
    long totalSeconds = TimeUnit.MILLISECONDS.toSeconds(duration);

    // Each field is the remainder left after removing the next larger unit.
    long days = TimeUnit.MILLISECONDS.toDays(duration);
    long hours = totalHours - TimeUnit.DAYS.toHours(days);
    long minutes = totalMinutes - TimeUnit.HOURS.toMinutes(totalHours);
    long seconds = totalSeconds - TimeUnit.MINUTES.toSeconds(totalMinutes);

    String result = formatUnit(seconds, "SECOND");
    if (days == 0 && hours == 0 && minutes == 0) {
        return result;
    }
    result = formatUnit(minutes, "MINUTE") + " " + result;
    if (days == 0 && hours == 0) {
        return result;
    }
    result = formatUnit(hours, "HOUR") + " " + result;
    if (days == 0) {
        return result;
    }
    return formatUnit(days, "DAY") + " " + result;
}

/** Renders {@code value} followed by the unit word, pluralised with "S" unless the value is exactly 1. */
private static String formatUnit(long value, String singular) {
    String word = (value == 1) ? singular : singular + "S";
    return String.format("%d %s", value, word);
}
}