devdaily home | apple | java | perl | unix | directory | blog

What this is

This file is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Other links

The source code

package com.swabunga.spell.engine;

import com.swabunga.util.StringUtility;

import java.io.*;
import java.util.HashMap;
import java.util.Vector;

/**
 * A generic implementation of a transformator that takes an aspell phonetics file and
 * constructs a transformation table from it using the inner class TransformationRule.
 *
 * @author Robert Gustavsson (robert@lindesign.se)
 */
public class GenericTransformator implements Transformator {


  /**
   * Default alphabet (upper-case A-Z). It is the initial value of alphabetString, so it
   * stays in effect when the phonetic file does not contain an alphabet line.
   */
  private static final char[] defaultEnglishAlphabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'};


  /** Character that opens the letter list on an alphabet line of the phonetic file. */
  public static final char ALPHABET_START = '[';
  /** Character that closes the letter list on an alphabet line (the last occurrence is used). */
  public static final char ALPHABET_END = ']';
  /** Prefix that marks a line of the phonetic file as the alphabet definition. */
  public static final String KEYWORD_ALPHBET = "alphabet";
  /** Lines of the phonetic file starting with one of these prefixes are skipped by buildRule. */
  public static final String[] IGNORED_KEYWORDS = {"version", "followup", "collapse_result"};

  /** Opens a multi-character group in a match expression; the group is counted as one position. */
  public static final char STARTMULTI = '(';
  /** Closes a multi-character group in a match expression. */
  public static final char ENDMULTI = ')';
  /** Replacement written by transform() at every position that holds a digit. */
  public static final String DIGITCODE = "0";
  /** A replace expression equal to this value is turned into the empty string by buildRule. */
  public static final String REPLACEVOID = "_";

  // Transformation table; null until buildRules() has stored the TransformationRule array.
  private Object[] ruleArray = null;
  // Alphabet in use: the default, then the one read from the phonetic file (if any),
  // and finally the washed replace list assigned by the constructors.
  private char[] alphabetString = defaultEnglishAlphabet;

  /**
   * Builds the transformation table from a phonetic file, read with the platform default
   * character encoding (as FileReader does).
   * <p>
   * The reader opened here is always closed again, also when reading fails, so the file
   * handle does not leak.
   *
   * @param phonetic the aspell phonetics file to read.
   * @throws IOException if the file cannot be opened or read.
   */
  public GenericTransformator(File phonetic) throws IOException {
    BufferedReader in = new BufferedReader(new FileReader(phonetic));
    try {
      buildRules(in);
    } finally {
      // This constructor opened the reader, so it is responsible for releasing it.
      in.close();
    }
    alphabetString = washAlphabetIntoReplaceList(getReplaceList());
  }

  /**
   * Builds the transformation table from a phonetic file that is decoded with the given
   * character encoding.
   * <p>
   * The file stream opened here is always closed again, also when the encoding name is
   * rejected or reading fails, so the file handle does not leak.
   *
   * @param phonetic the aspell phonetics file to read.
   * @param encoding name of the character encoding used to decode the file.
   * @throws IOException if the file cannot be opened or read, or the encoding is not supported.
   */
  public GenericTransformator(File phonetic, String encoding) throws IOException {
    InputStream raw = new FileInputStream(phonetic);
    try {
      buildRules(new BufferedReader(new InputStreamReader(raw, encoding)));
    } finally {
      // Closing the underlying stream releases the file even if the reader was never created.
      raw.close();
    }
    alphabetString = washAlphabetIntoReplaceList(getReplaceList());
  }

  public GenericTransformator(Reader phonetic) throws IOException {
    buildRules(new BufferedReader(phonetic));
    alphabetString = washAlphabetIntoReplaceList(getReplaceList());
  }

  /**
   * Reduces an alphabet to one representative letter per phonetic code.
   * <p>
   * Every letter is run through transform(); the first letter seen for a given code is
   * kept and later letters with the same code are dropped, because they are phonetically
   * redundant. A shorter replace list speeds up the getSuggestion method.
   * The order of the returned letters is the iteration order of the internal HashMap.
   *
   * @param alphabet The complete alphabet to wash.
   * @return The washed alphabet to be used as replace list.
   */
  private char[] washAlphabetIntoReplaceList(char[] alphabet) {
    HashMap representativeByCode = new HashMap(alphabet.length);

    for (int idx = 0; idx < alphabet.length; idx++) {
      char letter = alphabet[idx];
      String phoneticCode = transform(String.valueOf(letter));
      if (representativeByCode.containsKey(phoneticCode)) {
        // A letter with this code is already kept.
        continue;
      }
      representativeByCode.put(phoneticCode, new Character(letter));
    }

    Object[] kept = representativeByCode.values().toArray();
    char[] washed = new char[kept.length];
    int slot = 0;
    while (slot < kept.length) {
      Character boxed = (Character) kept[slot];
      washed[slot] = boxed.charValue();
      slot++;
    }
    return washed;
  }


  /**
   * Collects the replacement expressions that consist of exactly one character.
   * The resulting array can later be used for adding or changing letters in getSuggestion().
   *
   * @return char[] one entry for each rule whose replacement is a single character, in
   *         rule order (duplicates included), or null if no rule table has been built.
   */
  public char[] getCodeReplaceList() {
    if (ruleArray == null) {
      return null;
    }

    StringBuffer singles = new StringBuffer();
    for (int idx = 0; idx < ruleArray.length; idx++) {
      String replacement = ((TransformationRule) ruleArray[idx]).getReplaceExp();
      if (replacement.length() == 1) {
        singles.append(replacement);
      }
    }
    return singles.toString().toCharArray();
  }

  /**
   * Returns the alphabet array currently held by this transformator.
   * <p>
   * The field starts out as the default English alphabet and is overwritten when the
   * phonetic file contains an alphabet line. Every constructor afterwards replaces it with
   * the result of washAlphabetIntoReplaceList(), so on a fully constructed instance this
   * is the washed replace list. No visible code path assigns null to the field.
   * The internal array itself is returned, not a copy.
   *
   * @return char[] The array of chars held as alphabet / replace list.
   */
  public char[] getReplaceList() {
    return alphabetString;
  }

  /**
   * Returns the phonetic code of the word.
   * <p>
   * The word is upper-cased and scanned from left to right. A digit is rewritten with
   * DIGITCODE. At any other position the rules are tried in table order; the first one that
   * matches rewrites the buffer and the scan continues right behind the inserted
   * replacement. If no rule matches, the scan moves one character forward.
   *
   * @param word the word to encode.
   * @return the phonetic code, or null if no rule table has been built.
   */
  public String transform(String word) {
    if (ruleArray == null) {
      return null;
    }

    StringBuffer buffer = new StringBuffer(word.toUpperCase());
    int length = buffer.length();
    int pos = 0;

    while (pos < length) {
      if (Character.isDigit(buffer.charAt(pos))) {
        StringUtility.replace(buffer, pos, pos + DIGITCODE.length(), DIGITCODE);
        pos++;
        continue;
      }

      // Step taken when no rule fires at this position.
      int advance = 1;
      for (int r = 0; r < ruleArray.length; r++) {
        TransformationRule candidate = (TransformationRule) ruleArray[r];
        // Skip rules anchored to the word start once we are past it, and rules that
        // would need more characters than are left.
        boolean notApplicable = (candidate.startsWithExp() && pos > 0)
            || (pos + candidate.lengthOfMatch() > length);
        if (notApplicable || !candidate.isMatching(buffer, pos)) {
          continue;
        }

        String replacement = candidate.getReplaceExp();
        int removed = candidate.getTakeOut();
        advance = replacement.length();
        StringUtility.replace(buffer, pos, pos + removed, replacement);
        // Keep the tracked length in step with the rewritten buffer.
        length += advance - removed;
        break;
      }
      pos += advance;
    }
    return buffer.toString();
  }

  // Builds the transformation table: every line of the reader is trimmed with
  // realTrimmer() and handed to buildRule(); the collected rules end up in ruleArray.
  private void buildRules(BufferedReader in) throws IOException {
    Vector collected = new Vector();
    for (String line = in.readLine(); line != null; line = in.readLine()) {
      buildRule(realTrimmer(line), collected);
    }
    ruleArray = collected.toArray(new TransformationRule[collected.size()]);
  }

  /**
   * Parses one already trimmed line of the phonetics file.
   * <p>
   * Empty lines and lines starting with one of IGNORED_KEYWORDS are skipped. A line
   * starting with KEYWORD_ALPHBET replaces alphabetString with the characters between the
   * first ALPHABET_START and the last ALPHABET_END; a line where the brackets are missing
   * or in the wrong order is ignored instead of failing. Any other line is read as
   * "matchExpression whitespace replaceExpression" and appended to ruleList as a
   * TransformationRule.
   *
   * @param str      the line to parse, without comment and surrounding whitespace.
   * @param ruleList list that receives the rule built from the line.
   */
  private void buildRule(String str, Vector ruleList) {
    if (str.length() < 1)
      return;
    for (int i = 0; i < IGNORED_KEYWORDS.length; i++) {
      if (str.startsWith(IGNORED_KEYWORDS[i]))
        return;
    }

    // A different alphabet is used for this language, will be read into
    // the alphabetString variable.
    if (str.startsWith(KEYWORD_ALPHBET)) {
      int open = str.indexOf(ALPHABET_START);
      int close = str.lastIndexOf(ALPHABET_END);
      // close > open also rules out close == -1 and prevents substring() from
      // throwing when the closing bracket precedes the opening one.
      if (open != -1 && close > open) {
        alphabetString = str.substring(open + 1, close).toCharArray();
      }
      return;
    }

    StringBuffer matchExp = new StringBuffer();
    StringBuffer replaceExp = new StringBuffer();
    boolean start = false;
    boolean end = false;
    int takeOutPart = 0;
    int matchLength = 0;
    // True while reading the match expression; the first whitespace switches to the
    // replace expression for the rest of the line.
    boolean match = true;
    boolean inMulti = false;

    for (int i = 0; i < str.length(); i++) {
      char ch = str.charAt(i);
      if (Character.isWhitespace(ch)) {
        match = false;
        continue;
      }
      if (!match) {
        replaceExp.append(ch);
        continue;
      }

      if (!isReservedChar(ch)) {
        matchExp.append(ch);
        // A whole "(...)" group counts as one position: only the opening bracket
        // is counted, its content and the closing bracket are not.
        if (!inMulti) {
          takeOutPart++;
          matchLength++;
        }
        if (ch == STARTMULTI || ch == ENDMULTI)
          inMulti = !inMulti;
      }
      // Each '-' reduces the number of characters that get replaced by one.
      if (ch == '-')
        takeOutPart--;
      if (ch == '^')
        start = true;
      if (ch == '$')
        end = true;
    }

    // REPLACEVOID stands for "replace with nothing".
    if (replaceExp.toString().equals(REPLACEVOID)) {
      replaceExp = new StringBuffer("");
    }
    TransformationRule rule = new TransformationRule(matchExp.toString(), replaceExp.toString(), takeOutPart, matchLength, start, end);
    ruleList.addElement(rule);
  }

  // Tells whether a character has a special meaning to aspell: '<', '>', '^', '$', '-'
  // or any digit. Not every special character of aspell is implemented here.
  private boolean isReservedChar(char ch) {
    switch (ch) {
      case '<':
      case '>':
      case '^':
      case '$':
      case '-':
        return true;
      default:
        return Character.isDigit(ch);
    }
  }

  // Strips everything we don't care about: the comment starting at the first '#'
  // (if there is one) and any leading or trailing whitespace.
  private String realTrimmer(String row) {
    int commentStart = row.indexOf('#');
    String content = (commentStart == -1) ? row : row.substring(0, commentStart);
    return content.trim();
  }

  // Inner Classes
  /*
  * Holds the match string and the replace string and all the rule attributes.
  * Is responsible for indicating matches.
  */
  private class TransformationRule {

    private String replace;
    private char[] match;
    // takeOut=number of chars to replace;
    // matchLength=length of matching string counting multies as one.
    private int takeOut, matchLength;
    private boolean start, end;

    // Construktor
    public TransformationRule(String match, String replace, int takeout, int matchLength, boolean start, boolean end) {
      this.match = match.toCharArray();
      this.replace = replace;
      this.takeOut = takeout;
      this.matchLength = matchLength;
      this.start = start;
      this.end = end;
    }

    /*
    * Returns true if word from pos and forward matches the match string.
    * Precondition: wordPos+matchLength




Copyright 1998-2008 Alvin Alexander
All Rights Reserved.
 
devdaily.com is based in louisville, kentucky, and this web site is hosted by godaddy.com