/*
* UniDict.java
*
* Created on November 5, 2007, 5:41 PM
*
* To change this template, choose Tools | Template Manager
* and open the template in the editor.
*/
import java.util.*;
import java.io.*;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
* Character dictionary assembled from CEDICT, Unistrok and Unihan data files.
*
* @author ljbuesch
*/
public class UniDict {
// Every character known to the dictionary; entries are added through getCharacter().
Vector<Character> Characters;
// Path of the CEDICT file; set by loadCEDict() and re-read by the CEDICT search methods.
String cedict_filename;
// Path of the Unistrok file; set by loadUnistrok().
String unistrok_filename;
// Path of the Unihan file; set by loadUnihan().
String unihan_filename;
/**
 * Creates an empty dictionary; no data file is read.
 */
public UniDict() {
    // backing store for the individual characters, pre-sized for 10000 entries
    this.Characters = new Vector<Character>(10000);
}
/**
 * Creates a dictionary intended to be backed by a file in our own XML format.
 * Reading the XML file is not implemented yet, so the dictionary starts empty.
 *
 * @param xmlFile path of the XML file (currently unused)
 */
public UniDict(String xmlFile) {
    // Initialise the character list so that buildXML() and the search
    // methods do not fail with a NullPointerException on this instance.
    Characters = new Vector<Character>(10000);
}
/**
 * Builds a DOM document describing every entry of {@link #Characters}.
 *
 * Layout produced for each character:
 * <pre>
 * HanziRecognizer
 *   Characters
 *     Character
 *       Traditional   (Codepoint, Radical)
 *       Simplified    (Codepoint, Radical)
 *       Pronunciation (Cantonese, Mandarin)
 *       Definitions
 *       Strokes
 * </pre>
 * Radical, Definitions and Strokes are emitted as empty elements for now.
 *
 * @return the populated document, or null if the XML document could not be created
 */
public org.w3c.dom.Document buildXML() {
    org.w3c.dom.Document doc;
    try {
        DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        DOMImplementation impl = db.getDOMImplementation();
        doc = impl.createDocument(null, "HanziRecognizer", null);
    } catch (Exception e) {
        // callers receive null when no document can be created
        return null;
    }
    // the root node is created together with the document
    Element root = doc.getDocumentElement();
    Element characters = doc.createElement("Characters");
    root.appendChild(characters);
    for (Character currentCharacter : this.Characters) {
        System.out.println("Saving "+currentCharacter.traditional);
        Element character = doc.createElement("Character");

        // <Traditional><Codepoint>...</Codepoint><Radical/></Traditional>
        Element traditional = doc.createElement("Traditional");
        Element t_codepoint = doc.createElement("Codepoint");
        if (currentCharacter.traditional != 0) {
            t_codepoint.appendChild(doc.createTextNode(String.valueOf(currentCharacter.traditional)));
        }
        traditional.appendChild(t_codepoint);
        traditional.appendChild(doc.createElement("Radical"));

        // <Simplified><Codepoint>...</Codepoint><Radical/></Simplified>
        Element simplified = doc.createElement("Simplified");
        Element s_codepoint = doc.createElement("Codepoint");
        if (currentCharacter.simplified != 0) {
            s_codepoint.appendChild(doc.createTextNode(String.valueOf(currentCharacter.simplified)));
        }
        simplified.appendChild(s_codepoint);
        simplified.appendChild(doc.createElement("Radical"));

        // <Pronunciation><Cantonese>...</Cantonese><Mandarin>...</Mandarin></Pronunciation>
        // Strings are tested by content (and for null); a reference comparison
        // against "" would let null and empty values through to createTextNode.
        Element pronunciation = doc.createElement("Pronunciation");
        Element cantonese = doc.createElement("Cantonese");
        if (currentCharacter.cantonese != null && currentCharacter.cantonese.length() > 0) {
            cantonese.appendChild(doc.createTextNode(currentCharacter.cantonese));
        }
        Element mandarin = doc.createElement("Mandarin");
        if (currentCharacter.mandarin != null && currentCharacter.mandarin.length() > 0) {
            mandarin.appendChild(doc.createTextNode(currentCharacter.mandarin));
        }
        pronunciation.appendChild(cantonese);
        pronunciation.appendChild(mandarin);

        // assemble the character element in its fixed child order
        character.appendChild(traditional);
        character.appendChild(simplified);
        character.appendChild(pronunciation);
        // TODO: fill in one child per definition
        character.appendChild(doc.createElement("Definitions"));
        // TODO: fill in one child per stroke
        character.appendChild(doc.createElement("Strokes"));

        characters.appendChild(character);
    }
    return doc;
}
/**
 * Creates a dictionary and fills it from the three source files, in the
 * order CEDICT, Unistrok, Unihan. Loading is best effort: the first failure
 * stops the remaining loads and only its message is printed.
 *
 * @param cedict_file   path of the CEDICT file
 * @param unistrok_file path of the Unistrok file
 * @param unihan_file   path of the Unihan file
 */
public UniDict(String cedict_file, String unistrok_file, String unihan_file){
    this.Characters = new Vector<Character>(10000);
    // remember all three paths up front, even if a load below fails
    this.cedict_filename = cedict_file;
    this.unistrok_filename = unistrok_file;
    this.unihan_filename = unihan_file;
    try {
        this.loadCEDict(cedict_file);
        this.loadUnistrok(unistrok_file);
        this.loadUnihan(unihan_file);
    } catch (Exception loadFailure) {
        System.out.println(loadFailure.getMessage());
    }
}
/**
 * Reads a CEDICT file (UTF-8) and adds every single-character entry to the
 * dictionary. Each usable line has the shape
 * {@code TRAD SIMP [pinyin] /definition/.../}.
 *
 * Comment lines, blank lines, multi-character words and lines missing the
 * bracketed pronunciation or the slash-delimited definition are skipped.
 * If an equal character is already stored, getCharacter() returns the stored
 * one and the values parsed from this line are not merged into it.
 *
 * @param filename path of the CEDICT file; also remembered in cedict_filename
 * @throws IOException if the file cannot be opened or read
 */
public void loadCEDict(String filename) throws IOException {
    cedict_filename = filename;
    BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(cedict_filename), "UTF-8"));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            String trimmed = line.trim();
            if (trimmed.length() == 0 || trimmed.startsWith("#")) {
                continue;
            }
            StringTokenizer st = new StringTokenizer(line);
            if (st.countTokens() < 2) {
                continue; // no traditional/simplified pair on this line
            }
            String trad = st.nextToken();
            String simp = st.nextToken();
            // only single-character entries belong in the character dictionary
            if (trad.length() != 1 || simp.length() != 1) {
                continue;
            }
            int pronStart = line.indexOf('[');
            int pronEnd = line.lastIndexOf(']');
            int defStart = line.indexOf('/');
            int defEnd = line.lastIndexOf('/');
            if (pronStart < 0 || pronEnd < pronStart || defStart < 0 || defEnd <= defStart) {
                continue; // malformed entry
            }
            Character c = new Character();
            c.traditional = trad.charAt(0);
            c.simplified = simp.charAt(0);
            c.mandarin = setPinyinUnicode(line.substring(pronStart + 1, pronEnd));
            c.definition = line.substring(defStart + 1, defEnd);
            this.getCharacter(c);
        }
    } finally {
        // release the file handle even when a read fails part-way through
        reader.close();
    }
}
/**
 * Looks up a character in the dictionary, adding it when it is not there yet.
 * The lookup uses Vector.indexOf, i.e. the equals() of the Character class.
 *
 * @param c the character to find or insert
 * @return the already stored equal character if there is one, otherwise c
 *         itself after it has been appended to the dictionary
 */
public Character getCharacter(Character c) {
    final int position = Characters.indexOf(c);
    if (position != -1) {
        return Characters.get(position);
    }
    Characters.add(c);
    return c;
}
/**
 * Command-line entry point. Currently does nothing: the sample code that
 * loads the three data files and sorts the characters is commented out.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared for the disabled loading code; the empty body never throws it
 */
public static void main(String[] args) throws IOException {
/*UniDict u=new UniDict();
u.loadCEDict("src\\cedict_ts.u8");
u.loadUnistrok("src\\unistroke_hanzi.u8");
u.loadUnihan("src\\Unihan.txt");
Stroke_Character s=new Stroke_Character();
Collections.sort(u.Characters,s);*/
}
/**
 * Reads a Unistrok stroke file and attaches the stroke sequence of every
 * listed character to the dictionary.
 *
 * Lines starting with '#' are comments; a comment containing "traditional"
 * or "simplified" switches which field the following code points are stored
 * in. A data line is expected to be
 * {@code HEXCODE <field> <field> | strokes [| filter]}; lines that do not
 * match (no space, no '|', non-hex code) are skipped.
 *
 * The file is decoded as UTF-8, like the other loaders of this class.
 *
 * @param filename path of the Unistrok file; also remembered in unistrok_filename
 * @throws IOException if the file cannot be opened or read
 */
public void loadUnistrok(String filename) throws IOException {
    unistrok_filename = filename;
    BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(unistrok_filename), "UTF-8"));
    try {
        String line;
        boolean simplified = false;
        while ((line = reader.readLine()) != null) {
            if (line.length() == 0) {
                continue;
            }
            if (line.charAt(0) == '#') {
                // section headers select the traditional or simplified field
                String header = line.toLowerCase();
                if (header.contains("traditional")) {
                    simplified = false;
                } else if (header.contains("simplified")) {
                    simplified = true;
                }
                continue;
            }
            // the hex code point is everything before the first space
            int firstSpace = line.indexOf(' ');
            if (firstSpace < 0) {
                continue;
            }
            String unicode = line.substring(0, firstSpace);
            line = line.substring(firstSpace + 1);
            int secondSpace = line.indexOf(' ');
            if (secondSpace < 0) {
                continue;
            }
            line = line.substring(secondSpace);
            int pipe = line.indexOf('|');
            if (pipe == -1) {
                continue;
            }
            char codepoint;
            try {
                codepoint = (char) Integer.parseInt(unicode, 16);
            } catch (NumberFormatException malformed) {
                continue; // not a hex code point: treat as a malformed line
            }
            // store the code point in the field selected by the current section
            Character currentCharacter = new Character();
            if (simplified) {
                currentCharacter.simplified = codepoint;
            } else {
                currentCharacter.traditional = codepoint;
            }
            currentCharacter = this.getCharacter(currentCharacter);
            // if simplified and traditional are the same character and strokes
            // are already defined, skip it, else the strokes would be added twice
            if (currentCharacter.strokes.size() > 0
                    && currentCharacter.simplified == currentCharacter.traditional) {
                continue;
            }
            // separate the strokes from the optional filter after a second '|'
            line = line.substring(pipe + 1);
            int filterStart = line.indexOf('|');
            String tokline = (filterStart != -1) ? line.substring(0, filterStart) : line;
            StringTokenizer st = new StringTokenizer(tokline);
            while (st.hasMoreTokens()) {
                String tok = st.nextToken();
                for (int i = 0; i < tok.length(); i++) {
                    char symbol = tok.charAt(i);
                    switch (symbol) {
                        case '1':
                        case '2':
                        case '3':
                        case '4':
                        case '6':
                        case '7':
                        case '8':
                        case '9':
                            // digits (5 is not accepted) are passed on as their numeric value
                            currentCharacter.addStroke(symbol - '0', Double.MAX_VALUE);
                            break;
                        // alias letters expand to a two-digit stroke code
                        case 'b':
                            currentCharacter.addStroke(62, Double.MAX_VALUE);
                            break;
                        case 'c':
                            currentCharacter.addStroke(26, Double.MAX_VALUE);
                            break;
                        case 'x':
                            currentCharacter.addStroke(21, Double.MAX_VALUE);
                            break;
                        case 'y':
                            currentCharacter.addStroke(23, Double.MAX_VALUE);
                            break;
                        default:
                            // unknown symbols are ignored silently
                            break;
                    }
                }
            }
        }
    } finally {
        // release the file handle even when parsing fails part-way through
        reader.close();
    }
}
/**
 * Writes the dictionary to a file in Unistrok form. Serialising a character
 * is not implemented yet, so the output currently consists of one empty line
 * per character.
 *
 * @param filename path of the file to create or overwrite
 * @throws IOException if the file cannot be opened or written
 */
public void writeUnistrok(String filename) throws IOException {
    BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
    try {
        for (Character character : Characters) {
            // TODO: writer.write(character.toUnistrokString());
            writer.newLine();
        }
    } finally {
        // flush and release the file even when a write fails
        writer.close();
    }
}
/**
 * Reads a Unihan file (UTF-8) and merges its data into the dictionary.
 *
 * Each data line is {@code U+XXXX<TAB>tag_name<TAB>value}. All lines of one
 * code point are collected into a scratch character; when the code point
 * changes (or the file ends) the scratch character is committed, but only
 * if it has a Mandarin reading, which is how Chinese characters are told
 * apart from the rest. The scratch character is reset on every code point
 * change so values never carry over from one character to the next.
 *
 * Comment lines, blank lines and lines without two tab separators are skipped.
 *
 * @param filename path of the Unihan file; also remembered in unihan_filename
 * @throws IOException if the file cannot be opened or read
 */
public void loadUnihan(String filename) throws IOException {
    unihan_filename = filename;
    BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(unihan_filename), "UTF-8"));
    try {
        String line;
        String unicode = "";
        Character unihan_character = new Character();
        while ((line = reader.readLine()) != null) {
            if (line.length() == 0 || line.startsWith("#")) {
                continue;
            }
            int firstTab = line.indexOf('\t');
            int lastTab = line.lastIndexOf('\t');
            if (firstTab < 2 || lastTab <= firstTab) {
                continue; // malformed line: needs "U+", a code and two tabs
            }
            // cuts off the leading U+ and goes up to the first TAB
            String lineUnicode = line.substring(2, firstTab);
            if (!lineUnicode.equals(unicode)) {
                // a new code point starts: store the finished one, start afresh
                commitUnihanCharacter(unihan_character, unicode);
                unihan_character = new Character();
                unicode = lineUnicode;
            }
            // from the first TAB to the last TAB
            String tag_name = line.substring(firstTab + 1, lastTab);
            // everything after the last TAB
            String line_value = line.substring(lastTab + 1);
            if (tag_name.equals("kMandarin")) {
                unihan_character.mandarin = setPinyinUnicode(line_value.toLowerCase());
            } else if (tag_name.equals("kCantonese")) {
                unihan_character.cantonese = line_value.toLowerCase();
            } else if (tag_name.equals("kTotalStrokes")) {
                unihan_character.total_strokes = Integer.parseInt(line_value);
            } else if (tag_name.equals("kIRGHanyuDaZidian")) {
                unihan_character.hdz = line_value;
            } else if (tag_name.equals("kRSUnicode")) {
                unihan_character.radical = line_value;
            } else if (tag_name.equals("kDefinition")) {
                unihan_character.definition = line_value;
            } else if (tag_name.equals("kSimplifiedVariant")) {
                unihan_character.simplified = parseUnihanVariant(line_value);
            } else if (tag_name.equals("kTraditionalVariant")) {
                unihan_character.traditional = parseUnihanVariant(line_value);
            }
        }
        // the last character of the file has no following line to trigger its commit
        commitUnihanCharacter(unihan_character, unicode);
    } finally {
        // release the file handle even when parsing fails part-way through
        reader.close();
    }
}

/**
 * Stores a fully collected Unihan character in the dictionary. Characters
 * without a Mandarin reading are ignored.
 *
 * NOTE(review): the Unihan Mandarin reading only ends up in the dictionary
 * when the character is newly added; an already stored character keeps its
 * own pronunciation. Confirm whether Unihan was meant to take precedence.
 */
private void commitUnihanCharacter(Character unihan_character, String unicode) {
    if (unihan_character.mandarin == null) {
        return;
    }
    // no variants defined: the character is its own simplified and traditional form
    if (unihan_character.simplified == 0 && unihan_character.traditional == 0) {
        // the char cast keeps only the low 16 bits of the code point
        char self = (char) Integer.parseInt(unicode, 16);
        unihan_character.simplified = self;
        unihan_character.traditional = self;
    }
    Character database_character = this.getCharacter(unihan_character);
    database_character.cantonese = unihan_character.cantonese;
    database_character.hdz = unihan_character.hdz;
    database_character.radical = unihan_character.radical;
    database_character.total_strokes = unihan_character.total_strokes;
    // a definition already present (e.g. from CEDICT) takes precedence
    if (database_character.definition == null) {
        database_character.definition = unihan_character.definition;
    }
}

/**
 * Parses the first "U+XXXX" entry of a variant field; when several variants
 * are listed (separated by spaces) only the first one is used.
 */
private static char parseUnihanVariant(String line_value) {
    int space = line_value.indexOf(' ');
    String firstCode = (space >= 0) ? line_value.substring(2, space) : line_value.substring(2);
    return (char) Integer.parseInt(firstCode, 16);
}
/**
 * Placeholder for loading a dictionary file; the body is empty, so calling
 * this method currently does nothing.
 *
 * @param filename path of the file to load (unused)
 * @throws IOException declared for the future implementation; never thrown by the empty body
 */
public void loadUniDict(String filename) throws IOException {
}
/**
 * Placeholder for writing a dictionary file; the body is empty, so calling
 * this method currently does nothing and creates no file.
 *
 * @param filename path of the file to write (unused)
 * @throws IOException declared for the future implementation; never thrown by the empty body
 */
public void writeUniDict(String filename) throws IOException {
}
/**
 * Converts numbered pinyin ("ni3 hao3", "lu:4") into pinyin with tone marks.
 *
 * Every whitespace-separated syllable is converted and the results are
 * joined with single spaces. For each syllable:
 * <ul>
 * <li>a trailing digit is taken as the tone and removed; tones 1-4 add a
 *     mark, any other digit (e.g. 5, neutral) adds none; a syllable without
 *     a trailing digit is left unmarked and keeps all its letters</li>
 * <li>"u:" is replaced by u-umlaut</li>
 * <li>the mark goes on 'a' if present, else on 'e', else on the 'o' of
 *     "ou", else on the last vowel of the syllable</li>
 * </ul>
 *
 * @param string_to_fix numbered pinyin, syllables separated by whitespace
 * @return the tone-marked pinyin; input without any syllable is returned unchanged
 */
String setPinyinUnicode(String string_to_fix){
    StringTokenizer st = new StringTokenizer(string_to_fix);
    if (!st.hasMoreTokens()) {
        return string_to_fix;
    }
    StringBuilder converted = new StringBuilder();
    while (st.hasMoreTokens()) {
        if (converted.length() > 0) {
            converted.append(' ');
        }
        converted.append(convertPinyinSyllable(st.nextToken()));
    }
    return converted.toString();
}

/** Converts one numbered pinyin syllable (never empty) to its tone-marked form. */
private static String convertPinyinSyllable(String syllable) {
    int tone = 0;
    String body = syllable;
    char last = syllable.charAt(syllable.length() - 1);
    if (last >= '0' && last <= '9') {
        tone = last - '0';
        body = syllable.substring(0, syllable.length() - 1);
    }
    // "u:" is the ASCII spelling of u-umlaut
    body = body.replace("u:", "\u00FC");
    int target = findToneVowel(body);
    if (target < 0 || tone < 1 || tone > 4) {
        return body; // neutral tone, no tone digit, or no vowel to mark
    }
    return body.substring(0, target)
            + markedVowel(body.charAt(target), tone)
            + body.substring(target + 1);
}

/** Returns the index of the vowel that carries the tone mark, or -1 if there is none. */
private static int findToneVowel(String body) {
    // 'a' always wins, then 'e', then the 'o' of "ou"
    String[] priority = {"A", "a", "E", "e", "ou", "Ou"};
    for (int p = 0; p < priority.length; p++) {
        int at = body.indexOf(priority[p]);
        if (at >= 0) {
            return at;
        }
    }
    // otherwise the last vowel of the syllable takes the mark
    for (int i = body.length() - 1; i >= 0; i--) {
        if ("oiu\u00FCO".indexOf(body.charAt(i)) >= 0) {
            return i;
        }
    }
    return -1;
}

/** Returns the given vowel carrying the mark for tone 1-4; other characters are returned as is. */
private static char markedVowel(char vowel, int tone) {
    String marks;
    switch (vowel) {
        case 'a': marks = "\u0101\u00E1\u01CE\u00E0"; break;
        case 'A': marks = "\u0100\u00C1\u01CD\u00C0"; break;
        case 'e': marks = "\u0113\u00E9\u011B\u00E8"; break;
        case 'E': marks = "\u0112\u00C9\u011A\u00C8"; break;
        case 'o': marks = "\u014D\u00F3\u01D2\u00F2"; break;
        case 'O': marks = "\u014C\u00D3\u01D1\u00D2"; break;
        case 'i': marks = "\u012B\u00ED\u01D0\u00EC"; break;
        case 'u': marks = "\u016B\u00FA\u01D4\u00F9"; break;
        case '\u00FC': marks = "\u01D6\u01D8\u01DA\u01DC"; break;
        default: return vowel;
    }
    return marks.charAt(tone - 1);
}
/**
 * Searches the CEDICT file named by cedict_filename for entries whose
 * headword part contains the given characters in sequence. The sequence is
 * built from the traditional form of each character.
 *
 * Each hit is returned as "headwords pronunciation definition", where
 * headwords is the line text before '['. Comment lines and lines without a
 * bracketed pronunciation and slash-delimited definition are skipped.
 * Failures are best effort: the message is printed and the entries found so
 * far are returned.
 *
 * @param characters the characters that must appear consecutively in the headword
 * @return the formatted matching entries, possibly empty, never null
 */
public Vector<String> getCompoudCharacterWords(Vector<Character> characters){
    Vector<String> dictionary_entries = new Vector<String>();
    StringBuilder sequence = new StringBuilder();
    for (Character next_character : characters) {
        sequence.append(next_character.traditional);
    }
    String hanzi_sequence = sequence.toString();
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(cedict_filename), "UTF-8"));
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.trim().startsWith("#")) {
                continue;
            }
            int pronStart = line.indexOf('[');
            int pronEnd = line.indexOf(']');
            int defStart = line.indexOf('/');
            int defEnd = line.lastIndexOf('/');
            // skipping a malformed line keeps one bad line from aborting the whole search
            if (pronStart < 0 || pronEnd < pronStart || defEnd <= defStart) {
                continue;
            }
            String line_hanzi = line.substring(0, pronStart);
            if (line_hanzi.contains(hanzi_sequence)) {
                dictionary_entries.add(line_hanzi
                        + line.substring(pronStart + 1, pronEnd) + " "
                        + line.substring(defStart + 1, defEnd));
            }
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing a read-only file fails
            }
        }
    }
    return dictionary_entries;
}
/**
 * Finds every stored character whose Mandarin reading equals one of the
 * given pinyin strings, ignoring case. Characters without a Mandarin
 * reading and null search strings never match.
 *
 * @param pinyin the readings to look for
 * @return the matching characters, grouped in the order of the given
 *         readings; a character is listed once per reading it matches
 */
public Vector<Character> FindCharacterByPinyin(Vector<String> pinyin){
    Vector<Character> character_matches = new Vector<Character>();
    for (String this_pinyin : pinyin) {
        if (this_pinyin == null) {
            continue;
        }
        for (Character current_char : Characters) {
            // called on the search string so a null mandarin simply fails to match
            if (this_pinyin.equalsIgnoreCase(current_char.mandarin)) {
                character_matches.add(current_char);
            }
        }
    }
    return character_matches;
}
/**
 * Finds every stored character whose definition equals the given text,
 * ignoring case. Characters without a definition never match, and a null
 * search text matches nothing.
 *
 * @param definition the complete definition text to look for
 * @return the matching characters, possibly empty, never null
 */
public Vector<Character> FindCharacterByDefinition(String definition){
    Vector<Character> character_matches = new Vector<Character>();
    if (definition == null) {
        return character_matches;
    }
    for (Character current_char : Characters) {
        // called on the search text so a null definition simply fails to match
        if (definition.equalsIgnoreCase(current_char.definition)) {
            character_matches.add(current_char);
        }
    }
    return character_matches;
}
/**
 * Searches the CEDICT file named by cedict_filename for entries whose
 * definition part (the text between the first and the last '/') contains
 * the given text. The comparison is case sensitive.
 *
 * Each hit is returned as "headwords pronunciation definition", where
 * headwords is the line text before '['. Comment lines and lines without a
 * bracketed pronunciation and slash-delimited definition are skipped.
 * Failures are best effort: the message is printed and the entries found so
 * far are returned.
 *
 * @param definition the text that must occur in the definition
 * @return the formatted matching entries, possibly empty, never null
 */
public Vector<String> FindUnidictEntryByDefinition(String definition){
    Vector<String> dictionary_entries = new Vector<String>();
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(cedict_filename), "UTF-8"));
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.trim().startsWith("#")) {
                continue;
            }
            int pronStart = line.indexOf('[');
            int pronEnd = line.indexOf(']');
            int defStart = line.indexOf('/');
            int defEnd = line.lastIndexOf('/');
            // skipping a malformed line keeps one bad line from aborting the whole search
            if (pronStart < 0 || pronEnd < pronStart || defEnd <= defStart) {
                continue;
            }
            String line_definition = line.substring(defStart + 1, defEnd);
            if (line_definition.contains(definition)) {
                dictionary_entries.add(line.substring(0, pronStart)
                        + line.substring(pronStart + 1, pronEnd) + " "
                        + line_definition);
            }
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing a read-only file fails
            }
        }
    }
    return dictionary_entries;
}
}