What this is
This file is included in the DevDaily.com
"Java Source Code
Warehouse" project. The intent of this project is to help you "Learn
Java by Example" TM.
Other links
The source code
/* Created by bgalbs on Jan 30, 2003 at 11:45:25 PM */
package com.swabunga.spell.engine;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.security.InvalidParameterException;
import java.util.*;
/**
* Container for various methods that any SpellDictionary will use.
* Based on the original Jazzy aspell port.
*
*
*
*/
public abstract class SpellDictionaryASpell implements SpellDictionary {
/** The reference to a Transformator, used to transform a word into it's phonetic code. */
protected Transformator tf;
public SpellDictionaryASpell(File phonetic) throws IOException {
if (phonetic == null)
tf = new DoubleMeta();
else
tf = new GenericTransformator(phonetic);
}
public SpellDictionaryASpell(File phonetic, String encoding) throws IOException {
if (phonetic == null)
tf = new DoubleMeta();
else
tf = new GenericTransformator(phonetic, encoding);
}
public SpellDictionaryASpell(Reader phonetic) throws IOException {
if (phonetic == null)
tf = new DoubleMeta();
else
tf = new GenericTransformator(phonetic);
}
/**
* Returns a list of Word objects that are the suggestions to an
* incorrect word.
*
* @param word Suggestions for given mispelt word
* @param threshold The lower boundary of similarity to mispelt word
* @return Vector a List of suggestions
*/
public List getSuggestions(String word, int threshold) {
Hashtable nearmisscodes = new Hashtable();
String code = getCode(word);
// add all words that have the same phonetics
nearmisscodes.put(code, code);
Vector phoneticList = getWordsFromCode(word, nearmisscodes);
// do some tranformations to pick up more results
//interchange
nearmisscodes = new Hashtable();
char[] charArray = word.toCharArray();
for (int i = 0; i < word.length() - 1; i++) {
char a = charArray[i];
char b = charArray[i + 1];
charArray[i] = b;
charArray[i + 1] = a;
String s = getCode(new String(charArray));
nearmisscodes.put(s, s);
charArray[i] = a;
charArray[i + 1] = b;
}
char[] replacelist = tf.getReplaceList();
//change
charArray = word.toCharArray();
for (int i = 0; i < word.length(); i++) {
char original = charArray[i];
for (int j = 0; j < replacelist.length; j++) {
charArray[i] = replacelist[j];
String s = getCode(new String(charArray));
nearmisscodes.put(s, s);
}
charArray[i] = original;
}
//add
charArray = (word += " ").toCharArray();
int iy = charArray.length - 1;
while (true) {
for (int j = 0; j < replacelist.length; j++) {
charArray[iy] = replacelist[j];
String s = getCode(new String(charArray));
nearmisscodes.put(s, s);
}
if (iy == 0)
break;
charArray[iy] = charArray[iy - 1];
--iy;
}
//delete
word = word.trim();
charArray = word.toCharArray();
char[] charArray2 = new char[charArray.length - 1];
for (int ix = 0; ix < charArray2.length; ix++) {
charArray2[ix] = charArray[ix];
}
char a, b;
a = charArray[charArray.length - 1];
int ii = charArray2.length;
while (true) {
String s = getCode(new String(charArray));
nearmisscodes.put(s, s);
if (ii == 0)
break;
b = a;
a = charArray2[ii - 1];
charArray2[ii - 1] = b;
--ii;
}
nearmisscodes.remove(code); //already accounted for in phoneticList
Vector wordlist = getWordsFromCode(word, nearmisscodes);
if (wordlist.size() == 0 && phoneticList.size() == 0)
addBestGuess(word, phoneticList);
// We sort a Vector at the end instead of maintaining a
// continously sorted TreeSet because everytime you add a collection
// to a treeset it has to be resorted. It's better to do this operation
// once at the end.
Collections.sort(phoneticList, new Word()); //always sort phonetic matches along the top
Collections.sort(wordlist, new Word()); //the non-phonetic matches can be listed below
phoneticList.addAll(wordlist);
return phoneticList;
}
/**
* When we don't come up with any suggestions (probably because the threshold was too strict),
* then pick the best guesses from the those words that have the same phonetic code.
* @param word - the word we are trying spell correct
* @param wordList - the linked list that will get the best guess
*/
private void addBestGuess(String word, Vector wordList) {
if (wordList.size() != 0)
throw new InvalidParameterException("the wordList vector must be empty");
int bestScore = Integer.MAX_VALUE;
String code = getCode(word);
List simwordlist = getWords(code);
LinkedList candidates = new LinkedList();
for (Iterator j = simwordlist.iterator(); j.hasNext();) {
String similar = (String) j.next();
int distance = EditDistance.getDistance(word, similar);
if (distance <= bestScore) {
bestScore = distance;
Word goodGuess = new Word(similar, distance);
candidates.add(goodGuess);
}
}
//now, only pull out the guesses that had the best score
for (Iterator iter = candidates.iterator(); iter.hasNext();) {
Word candidate = (Word) iter.next();
if (candidate.getCost() == bestScore)
wordList.add(candidate);
}
}
private Vector getWordsFromCode(String word, Hashtable codes) {
Configuration config = Configuration.getConfiguration();
Vector result = new Vector();
final int configDistance = config.getInteger(Configuration.SPELL_THRESHOLD);
for (Enumeration i = codes.keys(); i.hasMoreElements();) {
String code = (String) i.nextElement();
List simwordlist = getWords(code);
for (Iterator iter = simwordlist.iterator(); iter.hasNext();) {
String similar = (String) iter.next();
int distance = EditDistance.getDistance(word, similar);
if (distance < configDistance) {
Word w = new Word(similar, distance);
result.addElement(w);
}
}
}
return result;
}
/**
* Returns the phonetic code representing the word.
*/
public String getCode(String word) {
return tf.transform(word);
}
/**
* Returns a list of words that have the same phonetic code.
*/
protected abstract List getWords(String phoneticCode);
/**
* Returns true if the word is correctly spelled against the current word list.
*/
public boolean isCorrect(String word) {
List possible = getWords(getCode(word));
if (possible.contains(word))
return true;
//JMH should we always try the lowercase version. If I dont then capitalised
//words are always returned as incorrect.
else if (possible.contains(word.toLowerCase()))
return true;
return false;
}
}
|