Hanzi Recognizer Code

Status: Beta
Brought to you by: ljbuesch, marscheese
[r1]: / src / UniDict.java Maximize Restore History
739 lines (670 with data), 28.3 kB

/*
 * UniDict.java
 *
 * Created on November 5, 2007, 5:41 PM
 *
 * To change this template, choose Tools | Template Manager
 * and open the template in the editor.
 */

import java.util.*;

import java.io.*;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 *
 * @author ljbuesch
 */
public class UniDict {
    
    /** Dictionary entries loaded so far; getCharacter() looks entries up with Vector.indexOf. */
    Vector<Character> Characters;
    /** Path of the CEDICT file; set by loadCEDict() and re-read by the word and definition searches. */
    String cedict_filename;
    /** Path most recently passed to loadUnistrok(). */
    String unistrok_filename;
    /** Path most recently passed to loadUnihan(). */
    String unihan_filename;
    
    /** Creates a new instance of UniDict */
    public UniDict() {
    	//individual characters that will comprise our dictionary
        Characters=new Vector<Character> (10000);  
    }
    /**
     * Placeholder for loading a dictionary from our own XML format.
     * Parsing is not implemented yet, so {@code xmlFile} is currently ignored
     * and the dictionary starts out empty.
     *
     * @param xmlFile path of the XML dictionary file (not read yet)
     */
    public UniDict(String xmlFile) {
        // Always create the backing vector so that every other method can be
        // called on an instance built through this constructor without
        // hitting a NullPointerException.
        Characters = new Vector<Character>(10000);
        // TODO: parse xmlFile and populate Characters.
    }
    
    
    /**
     * Builds a DOM document describing every entry of {@code Characters}.
     * <p>
     * Layout produced:
     * <pre>
     * &lt;HanziRecognizer&gt;
     *   &lt;Characters&gt;
     *     &lt;Character&gt;
     *       &lt;Traditional&gt;&lt;Codepoint/&gt;&lt;Radical/&gt;&lt;/Traditional&gt;
     *       &lt;Simplified&gt;&lt;Codepoint/&gt;&lt;Radical/&gt;&lt;/Simplified&gt;
     *       &lt;Pronunciation&gt;&lt;Cantonese/&gt;&lt;Mandarin/&gt;&lt;/Pronunciation&gt;
     *       &lt;Definitions/&gt;
     *       &lt;Strokes/&gt;
     *     &lt;/Character&gt;
     *   &lt;/Characters&gt;
     * &lt;/HanziRecognizer&gt;
     * </pre>
     * Radical, Definitions and Strokes are emitted empty for now.
     *
     * @return the populated document, or {@code null} when the XML
     *         implementation could not create a document
     */
    public org.w3c.dom.Document buildXML() {
        org.w3c.dom.Document doc;
        try {
            DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            DOMImplementation impl = db.getDOMImplementation();
            doc = impl.createDocument(null, "HanziRecognizer", null);
        } catch (Exception e) {
            return null;
        }

        // the root node was created together with the document
        Element root = doc.getDocumentElement();
        Element characters = doc.createElement("Characters");
        root.appendChild(characters);

        for (Character currentCharacter : this.Characters) {
            System.out.println("Saving "+currentCharacter.traditional);

            // create all tags we will need
            Element character = doc.createElement("Character");
            Element traditional = doc.createElement("Traditional");
            Element simplified = doc.createElement("Simplified");
            Element t_codepoint = doc.createElement("Codepoint");
            Element t_radical = doc.createElement("Radical");
            Element s_codepoint = doc.createElement("Codepoint");
            Element s_radical = doc.createElement("Radical");
            Element pronunciation = doc.createElement("Pronunciation");
            Element mandarin = doc.createElement("Mandarin");
            Element cantonese = doc.createElement("Cantonese");
            Element definitions = doc.createElement("Definitions");
            Element strokes = doc.createElement("Strokes");

            // put elements in the character element
            character.appendChild(traditional);
            character.appendChild(simplified);
            character.appendChild(pronunciation);
            character.appendChild(definitions);
            character.appendChild(strokes);

            // <Codepoint> holds the character itself; left empty when unset (0)
            if (currentCharacter.traditional != 0) {
                t_codepoint.appendChild(doc.createTextNode(String.valueOf(currentCharacter.traditional)));
            }
            if (currentCharacter.simplified != 0) {
                s_codepoint.appendChild(doc.createTextNode(String.valueOf(currentCharacter.simplified)));
            }

            // <Traditional> and <Simplified> each wrap a <Codepoint> and a <Radical>
            traditional.appendChild(t_codepoint);
            traditional.appendChild(t_radical);
            simplified.appendChild(s_codepoint);
            simplified.appendChild(s_radical);

            // Readings are compared by content, not by reference, and a missing
            // (null) reading simply leaves the element empty.
            if (currentCharacter.mandarin != null && currentCharacter.mandarin.length() > 0) {
                mandarin.appendChild(doc.createTextNode(currentCharacter.mandarin));
            }
            if (currentCharacter.cantonese != null && currentCharacter.cantonese.length() > 0) {
                cantonese.appendChild(doc.createTextNode(currentCharacter.cantonese));
            }

            // <Pronunciation> wraps <Cantonese> followed by <Mandarin>
            pronunciation.appendChild(cantonese);
            pronunciation.appendChild(mandarin);

            // TODO: foreach loop here for definitions

            // TODO: foreach loop for strokes here

            // add character to characters tag
            characters.appendChild(character);
        }
        return doc;
    }
    
    /**
     * Creates a dictionary and fills it from the three source files, in the
     * order CEDICT, Unistrok, Unihan. A failure in any loader stops the
     * remaining loads; its message is printed to standard output and the
     * instance keeps whatever was loaded up to that point.
     *
     * @param cedict_file   path of the CEDICT file
     * @param unistrok_file path of the Unistrok stroke file
     * @param unihan_file   path of the Unihan file
     */
    public UniDict(String cedict_file, String unistrok_file, String unihan_file){
        // remember all three paths up front, even if a loader fails later
        this.unihan_filename = unihan_file;
        this.unistrok_filename = unistrok_file;
        this.cedict_filename = cedict_file;
        this.Characters = new Vector<Character>(10000);

        try {
            this.loadCEDict(cedict_file);
            this.loadUnistrok(unistrok_file);
            this.loadUnihan(unihan_file);
        } catch (Exception loadFailure) {
            System.out.println(loadFailure.getMessage());
        }
    }
    
    /**
     * Loads the single-character entries of a CEDICT file (read as UTF-8)
     * into the dictionary.
     * <p>
     * Each data line is parsed as
     * {@code TRADITIONAL SIMPLIFIED [pinyin] /definition/.../}. Lines starting
     * with {@code #}, blank lines, entries whose traditional or simplified
     * form is longer than one char, and lines missing the bracket or slash
     * delimiters are skipped. An entry that is already present (according to
     * {@link #getCharacter(Character)}) is left as it is.
     *
     * @param filename path of the CEDICT file; also stored in {@code cedict_filename}
     * @throws IOException if the file cannot be opened or read
     */
    public void loadCEDict(String filename) throws IOException {
        cedict_filename = filename;
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(cedict_filename), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.length() == 0 || trimmed.startsWith("#")) {
                    continue;
                }

                StringTokenizer st = new StringTokenizer(line);
                if (st.countTokens() < 2) {
                    continue; // malformed: no traditional/simplified pair
                }
                String trad = st.nextToken();
                String simp = st.nextToken();
                // only single characters are kept; check this before doing
                // any further parsing of the line
                if (trad.length() != 1 || simp.length() != 1) {
                    continue;
                }

                int pronStart = line.indexOf('[');
                int pronEnd = line.lastIndexOf(']');
                int defStart = line.indexOf('/');
                int defEnd = line.lastIndexOf('/');
                if (pronStart == -1 || pronEnd <= pronStart
                        || defStart == -1 || defEnd <= defStart) {
                    continue; // malformed: delimiters missing or out of order
                }

                Character c = new Character();
                c.traditional = trad.charAt(0);
                c.simplified = simp.charAt(0);
                c.mandarin = setPinyinUnicode(line.substring(pronStart + 1, pronEnd));
                c.definition = line.substring(defStart + 1, defEnd);
                // registers the entry unless an equal one already exists
                this.getCharacter(c);
            }
        } finally {
            reader.close();
        }
    }
    
    /**
     * Returns the dictionary entry matching {@code c}, registering {@code c}
     * itself when no such entry exists yet. Matching is whatever
     * {@code Vector.indexOf} decides, i.e. it relies on the {@code equals}
     * of the project's Character class.
     *
     * @param c the entry to look up or insert
     * @return the already stored entry, or {@code c} if it was just added
     */
    public Character getCharacter(Character c) {
        final int position = Characters.indexOf(c);
        if (position != -1) {
            // already known: hand back the stored instance
            return Characters.get(position);
        }
        Characters.add(c);
        return c;
    }
    
    /**
     * Manual test entry point. It currently does nothing: the sample code
     * that loads the three data files and sorts the characters is commented out.
     *
     * @param args command-line arguments (unused)
     * @throws IOException declared for the commented-out load calls
     */
    public static void main(String[] args) throws IOException {
        /*UniDict u=new UniDict();
        u.loadCEDict("src\\cedict_ts.u8");
        u.loadUnistrok("src\\unistroke_hanzi.u8");
        u.loadUnihan("src\\Unihan.txt");
        Stroke_Character s=new Stroke_Character();
        Collections.sort(u.Characters,s);*/
    }
    
    /**
     * Loads stroke sequences from a Unistrok file into the dictionary.
     * <p>
     * Comment lines (starting with {@code #}) switch the current section:
     * a comment containing "traditional" makes the following code points
     * traditional, one containing "simplified" makes them simplified.
     * A data line is parsed as
     * {@code HEXCODE <field> <...> | <stroke tokens> [| <filter>]}; only the
     * stroke tokens between the first and the second pipe are used.
     * Blank lines and lines that do not have this shape are skipped.
     *
     * @param filename path of the Unistrok file; also stored in {@code unistrok_filename}
     * @throws IOException if the file cannot be opened or read
     */
    public void loadUnistrok(String filename) throws IOException {
        unistrok_filename = filename;
        BufferedReader reader = new BufferedReader(new FileReader(unistrok_filename));
        try {
            String line;
            boolean simplified = false;
            // go through every line
            while ((line = reader.readLine()) != null) {
                if (line.length() == 0) {
                    continue;
                }
                if (line.charAt(0) == '#') {
                    String lowered = line.toLowerCase();
                    if (lowered.contains("traditional")) {
                        simplified = false;
                    } else if (lowered.contains("simplified")) {
                        simplified = true;
                    }
                    continue;
                }

                // the hex code point is everything up to the first space
                int firstSpace = line.indexOf(' ');
                if (firstSpace < 0) {
                    continue; // malformed: nothing follows the code point
                }
                String unicode = line.substring(0, firstSpace);
                line = line.substring(firstSpace + 1);

                // drop the second field as well
                int secondSpace = line.indexOf(' ');
                if (secondSpace < 0) {
                    continue;
                }
                line = line.substring(secondSpace);

                int pipe = line.indexOf('|');
                if (pipe == -1) {
                    continue;
                }

                char codePoint;
                try {
                    codePoint = (char) Integer.parseInt(unicode, 16);
                } catch (NumberFormatException malformed) {
                    continue; // first field is not a hex code point
                }

                // the section header decides which form this code point is
                Character currentCharacter = new Character();
                if (simplified) {
                    currentCharacter.simplified = codePoint;
                } else {
                    currentCharacter.traditional = codePoint;
                }
                currentCharacter = this.getCharacter(currentCharacter);

                // if simplified and traditional are the same character and its
                // strokes are already defined, skip it, otherwise the strokes
                // would be added twice
                if (currentCharacter.strokes.size() > 0
                        && currentCharacter.simplified == currentCharacter.traditional) {
                    continue;
                }

                // separate the strokes from the filter that may follow a second pipe
                line = line.substring(pipe + 1);
                int tokindex = line.indexOf('|');
                String tokline = (tokindex != -1) ? line.substring(0, tokindex) : line;

                StringTokenizer st = new StringTokenizer(tokline);
                while (st.hasMoreTokens()) {
                    String tok = st.nextToken();
                    for (int i = 0; i < tok.length(); i++) {
                        int strokeCode = unistrokStrokeCode(tok.charAt(i));
                        if (strokeCode != -1) {
                            currentCharacter.addStroke(strokeCode, Double.MAX_VALUE);
                        }
                        // unknown symbols are silently ignored
                    }
                }
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Maps one Unistrok symbol to the numeric stroke code handed to
     * {@code addStroke}: the digits 1-4 and 6-9 map to their own value and
     * the aliases b, c, x, y map to 62, 26, 21 and 23.
     * NOTE(review): the aliases presumably stand for two-stroke combinations
     * written as digit pairs - confirm against Character.addStroke.
     *
     * @return the stroke code, or -1 for a symbol that is not recognised
     */
    private static int unistrokStrokeCode(char symbol) {
        switch (symbol) {
            case '1':
            case '2':
            case '3':
            case '4':
            case '6':
            case '7':
            case '8':
            case '9':
                return symbol - '0';
            case 'b':
                return 62;
            case 'c':
                return 26;
            case 'x':
                return 21;
            case 'y':
                return 23;
            default:
                return -1;
        }
    }
    
    /**
     * Writes one line per dictionary entry to {@code filename}.
     * The per-character serialisation is not implemented yet (the call is
     * commented out), so every line is currently empty.
     *
     * @param filename path of the file to create or overwrite
     * @throws IOException if the file cannot be opened or written
     */
    public void writeUnistrok(String filename) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
        try {
            for (Character character : Characters) {
                //writer.write(character.toUnistrokString());
                writer.newLine();
            }
        } finally {
            // release the file handle even when a write fails
            writer.close();
        }
    }
    
    /**
     * Merges data from a Unihan file (read as UTF-8) into the dictionary.
     * <p>
     * Every data line has the form {@code U+HEX<TAB>tag<TAB>value}. All lines
     * of one code point are accumulated into a scratch entry; when the code
     * point changes (or the file ends) the scratch entry is committed, but
     * only if it received a Mandarin reading. Blank lines, {@code #} comments
     * and lines without two tab-separated fields after the code point are
     * skipped.
     *
     * @param filename path of the Unihan file; also stored in {@code unihan_filename}
     * @throws IOException if the file cannot be opened or read
     */
    public void loadUnihan(String filename) throws IOException {
        unihan_filename = filename;
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(unihan_filename), "UTF-8"));
        try {
            String line;
            String unicode = "";
            Character unihan_character = new Character();

            while ((line = reader.readLine()) != null) {
                if (line.length() == 0 || line.startsWith("#")) {
                    continue;
                }
                int firstTab = line.indexOf('\t');
                int lastTab = line.lastIndexOf('\t');
                if (firstTab < 2 || lastTab == firstTab) {
                    continue; // malformed: needs "U+HEX", a tag and a value
                }

                // cut off the leading "U+" and go up to the first TAB
                String lineUnicode = line.substring(2, firstTab);
                if (!unicode.equals(lineUnicode)) {
                    // a new code point starts: store the finished one and begin
                    // with a fresh scratch entry, so that nothing collected for
                    // a skipped code point leaks into the next one
                    commitUnihanCharacter(unihan_character, unicode);
                    unihan_character = new Character();
                    unicode = lineUnicode;
                }

                String tag_name = line.substring(firstTab + 1, lastTab);
                String line_value = line.substring(lastTab + 1);

                if (tag_name.equals("kMandarin")) {
                    unihan_character.mandarin = setPinyinUnicode(line_value.toLowerCase());
                } else if (tag_name.equals("kCantonese")) {
                    unihan_character.cantonese = line_value.toLowerCase();
                } else if (tag_name.equals("kTotalStrokes")) {
                    // the field may list several counts; use the first one
                    String first = line_value.trim();
                    int space = first.indexOf(' ');
                    if (space != -1) {
                        first = first.substring(0, space);
                    }
                    try {
                        unihan_character.total_strokes = Integer.parseInt(first);
                    } catch (NumberFormatException ignored) {
                        // not a number: leave the stroke count unset
                    }
                } else if (tag_name.equals("kIRGHanyuDaZidian")) {
                    unihan_character.hdz = line_value;
                } else if (tag_name.equals("kRSUnicode")) {
                    unihan_character.radical = line_value;
                } else if (tag_name.equals("kDefinition")) {
                    unihan_character.definition = line_value;
                } else if (tag_name.equals("kSimplifiedVariant")) {
                    // with two or more simplified variants only the first is taken
                    unihan_character.simplified = firstVariantCodePoint(line_value);
                } else if (tag_name.equals("kTraditionalVariant")) {
                    // with two or more traditional variants only the first is taken
                    unihan_character.traditional = firstVariantCodePoint(line_value);
                }
            }

            // the last code point of the file has no successor to trigger its commit
            commitUnihanCharacter(unihan_character, unicode);
        } finally {
            reader.close();
        }
    }

    /**
     * Stores the data collected for one Unihan code point in the dictionary.
     * Nothing happens when no code point has been read yet, when the entry
     * has no Mandarin reading (it is then not treated as a Chinese character)
     * or when the code point does not fit in a {@code char}.
     * NOTE(review): the Mandarin reading itself is not copied onto an entry
     * that already exists - confirm whether that is intended.
     */
    private void commitUnihanCharacter(Character unihan_character, String unicode) {
        if (unicode.length() == 0 || unihan_character.mandarin == null) {
            return;
        }
        int codePoint;
        try {
            codePoint = Integer.parseInt(unicode, 16);
        } catch (NumberFormatException malformed) {
            return;
        }
        if (codePoint > 0xFFFF) {
            // the char fields cannot hold it; casting would alias another character
            return;
        }
        // no variants defined: the character is its own traditional and simplified form
        if (unihan_character.simplified == 0 && unihan_character.traditional == 0) {
            unihan_character.simplified = (char) codePoint;
            unihan_character.traditional = (char) codePoint;
        }
        Character database_character = this.getCharacter(unihan_character);
        database_character.cantonese = unihan_character.cantonese;
        database_character.hdz = unihan_character.hdz;
        database_character.radical = unihan_character.radical;
        database_character.total_strokes = unihan_character.total_strokes;
        // CEDict definition takes precedence over unihan definition
        if (database_character.definition == null) {
            database_character.definition = unihan_character.definition;
        }
    }

    /**
     * Parses the first {@code U+HEX} item of a space-separated variant list.
     *
     * @return the code point, or 0 (meaning "unset") when it cannot be parsed
     *         or does not fit in a {@code char}
     */
    private static char firstVariantCodePoint(String line_value) {
        String first = line_value.trim();
        int space = first.indexOf(' ');
        if (space != -1) {
            first = first.substring(0, space);
        }
        if (first.length() <= 2) {
            return 0;
        }
        try {
            int codePoint = Integer.parseInt(first.substring(2), 16);
            return codePoint <= 0xFFFF ? (char) codePoint : 0;
        } catch (NumberFormatException malformed) {
            return 0;
        }
    }
    
    /**
     * Placeholder: not implemented yet, the body is empty and
     * {@code filename} is ignored.
     *
     * @param filename path of the file to load (currently unused)
     * @throws IOException declared for the future implementation; never thrown today
     */
    public void loadUniDict(String filename) throws IOException {
        
    }
    
    /**
     * Placeholder: not implemented yet, the body is empty and nothing is
     * written.
     *
     * @param filename path of the file to write (currently unused)
     * @throws IOException declared for the future implementation; never thrown today
     */
    public void writeUniDict(String filename) throws IOException {

    }
    
    /** Vowels that can carry a tone mark; index-aligned with {@code TONED_VOWELS}. */
    private static final String TONE_CARRIERS = "AaEeoiu\u00FCO";

    /** For each tone carrier, its forms for tones 1 to 4 (in that order). */
    private static final String[] TONED_VOWELS = {
        "\u0100\u00C1\u01CD\u00C0", // A
        "\u0101\u00E1\u01CE\u00E0", // a
        "\u0112\u00C9\u011A\u00C8", // E
        "\u0113\u00E9\u011B\u00E8", // e
        "\u014D\u00F3\u01D2\u00F2", // o
        "\u012B\u00ED\u01D0\u00EC", // i
        "\u016B\u00FA\u01D4\u00F9", // u
        "\u01D6\u01D8\u01DA\u01DC", // u-umlaut
        "\u014C\u00D3\u01D1\u00D2"  // O
    };

    /**
     * Converts numbered pinyin ("hao3", "lu:4") into pinyin with tone marks.
     * <p>
     * The input is split on whitespace; every syllable is converted and the
     * results are joined with single spaces. For each syllable:
     * <ul>
     * <li>"u:" is turned into u-umlaut;</li>
     * <li>a trailing digit is taken as the tone and removed; a syllable
     *     without a trailing digit is returned as it is;</li>
     * <li>for tones 1-4 the mark goes on 'A', else 'a', else 'E', else 'e',
     *     else the 'o' of "ou", else whichever of o, i, u, u-umlaut comes
     *     last, else 'O'; any other digit (neutral tone) adds no mark.</li>
     * </ul>
     *
     * @param string_to_fix whitespace-separated numbered pinyin syllables
     * @return the converted syllables, or {@code string_to_fix} itself when
     *         it contains no syllable at all
     */
    String setPinyinUnicode(String string_to_fix){
        StringTokenizer st = new StringTokenizer(string_to_fix);
        if (!st.hasMoreTokens()) {
            return string_to_fix;
        }
        StringBuilder converted = new StringBuilder();
        while (st.hasMoreTokens()) {
            if (converted.length() > 0) {
                converted.append(' ');
            }
            converted.append(convertPinyinSyllable(st.nextToken()));
        }
        return converted.toString();
    }

    /** Converts one numbered syllable (never empty) to its tone-marked form. */
    private static String convertPinyinSyllable(String syllable) {
        // "u:" is the ASCII spelling of u-umlaut; replacing the pair as a
        // whole also gets rid of the colon wherever it stands
        syllable = syllable.replace("u:", "\u00FC");

        int last = syllable.length() - 1;
        char tail = syllable.charAt(last);
        if (tail < '0' || tail > '9') {
            return syllable; // no tone digit: nothing to strip or mark
        }
        int tone = tail - '0';
        String body = syllable.substring(0, last);
        if (tone < 1 || tone > 4) {
            return body; // neutral tone carries no mark
        }

        int carrier = findToneCarrier(body);
        if (carrier == -1) {
            return body;
        }
        int row = TONE_CARRIERS.indexOf(body.charAt(carrier));
        return body.substring(0, carrier)
                + TONED_VOWELS[row].charAt(tone - 1)
                + body.substring(carrier + 1);
    }

    /** Returns the index of the vowel that takes the tone mark, or -1 if there is none. */
    private static int findToneCarrier(String body) {
        // 'a' and 'e' always win, capital forms first
        String priority = "AaEe";
        for (int k = 0; k < priority.length(); k++) {
            int at = body.indexOf(priority.charAt(k));
            if (at != -1) {
                return at;
            }
        }
        // in "ou" the mark goes on the 'o'
        if (body.indexOf("ou") != -1) {
            return body.indexOf('o');
        }
        // otherwise the vowel that comes last takes the mark
        int lastVowel = Math.max(
                Math.max(body.indexOf('o'), body.indexOf('i')),
                Math.max(body.indexOf('u'), body.indexOf('\u00FC')));
        if (lastVowel != -1) {
            return lastVowel;
        }
        return body.indexOf('O');
    }
    
    /**
     * Searches the CEDICT file for entries whose headword part contains the
     * given characters, in order, as one contiguous sequence of their
     * traditional forms.
     * <p>
     * Each hit is returned as {@code headwords + pinyin + " " + definitions},
     * where headwords is the text before the first '[' of the line. Comment
     * lines and lines without the bracket or slash delimiters are skipped.
     * Any error while reading is printed to standard output and ends the
     * search; the entries found so far are still returned.
     *
     * @param characters the characters making up the sequence to look for
     * @return the formatted matching entries, possibly empty
     */
    public Vector<String> getCompoudCharacterWords(Vector<Character> characters){
        Vector<String> dictionary_entries = new Vector<String>();

        StringBuilder sequence = new StringBuilder();
        for (Character next_character : characters) {
            sequence.append(next_character.traditional);
        }
        String hanzi_sequence = sequence.toString();

        BufferedReader reader = null;
        try {
            reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(cedict_filename), "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                int pronStart = line.indexOf('[');
                int pronEnd = line.indexOf(']');
                int defStart = line.indexOf('/');
                int defEnd = line.lastIndexOf('/');
                // header comments and malformed lines must not abort the search
                if (line.startsWith("#") || pronStart == -1 || pronEnd <= pronStart
                        || defEnd <= defStart) {
                    continue;
                }

                String line_hanzi = line.substring(0, pronStart);
                if (line_hanzi.contains(hanzi_sequence)) {
                    // hanzi + pronunciation + " " + definition
                    dictionary_entries.add(line_hanzi
                            + line.substring(pronStart + 1, pronEnd) + " "
                            + line.substring(defStart + 1, defEnd));
                }
            }
        } catch (Exception e) {
            System.out.println(e.getMessage());
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
        }

        return dictionary_entries;
    }
    
    /**
     * Collects the dictionary entries whose Mandarin reading equals one of
     * the given pinyin strings, ignoring case. Results are grouped by pinyin,
     * in the order the pinyin strings were given; an entry is listed once
     * per pinyin string it matches.
     *
     * @param pinyin the readings to look for
     * @return the matching entries, possibly empty
     */
    public Vector<Character> FindCharacterByPinyin(Vector<String> pinyin){
        Vector<Character> character_matches = new Vector<Character>();

        for (String this_pinyin : pinyin) {
            for (Character current_char : Characters) {
                // an entry may have no Mandarin reading at all (null); skip
                // it instead of failing the whole search
                if (current_char.mandarin != null
                        && current_char.mandarin.equalsIgnoreCase(this_pinyin)) {
                    character_matches.add(current_char);
                }
            }
        }

        return character_matches;
    }
    
    /**
     * Collects the dictionary entries whose whole definition text equals
     * {@code definition}, ignoring case.
     *
     * @param definition the definition text to look for
     * @return the matching entries, possibly empty
     */
    public Vector<Character> FindCharacterByDefinition(String definition){
        Vector<Character> character_matches = new Vector<Character>();

        for (Character current_char : Characters) {
            // an entry may have no definition at all (null); skip it instead
            // of failing the whole search
            if (current_char.definition != null
                    && current_char.definition.equalsIgnoreCase(definition)) {
                character_matches.add(current_char);
            }
        }
        return character_matches;
    }
    
    /**
     * Searches the CEDICT file for entries whose definition part contains
     * {@code definition} (case-sensitive substring match).
     * <p>
     * Each hit is returned as {@code headwords + pinyin + " " + definitions},
     * where headwords is the text before the first '[' of the line. Comment
     * lines and lines without the bracket or slash delimiters are skipped.
     * Any error while reading is printed to standard output and ends the
     * search; the entries found so far are still returned.
     *
     * @param definition the text to look for inside the definitions
     * @return the formatted matching entries, possibly empty
     */
    public Vector<String> FindUnidictEntryByDefinition(String definition){
        Vector<String> dictionary_entries = new Vector<String>();

        BufferedReader reader = null;
        try {
            reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(cedict_filename), "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                int pronStart = line.indexOf('[');
                int pronEnd = line.indexOf(']');
                int defStart = line.indexOf('/');
                int defEnd = line.lastIndexOf('/');
                // header comments and malformed lines must not abort the search
                if (line.startsWith("#") || pronStart == -1 || pronEnd <= pronStart
                        || defEnd <= defStart) {
                    continue;
                }

                String line_definition = line.substring(defStart + 1, defEnd);
                if (line_definition.contains(definition)) {
                    // hanzi + pronunciation + " " + definition
                    dictionary_entries.add(line.substring(0, pronStart)
                            + line.substring(pronStart + 1, pronEnd) + " "
                            + line_definition);
                }
            }
        } catch (Exception e) {
            System.out.println(e.getMessage());
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
        }

        return dictionary_entries;
    }
    
}