/*
Copyright (C) 2000-2010  Ministere de la culture et de la communication (France), AJLSM
See LICENCE file
*/
/*
 * Created by IntelliJ IDEA.
 * User: rpandey
 * Date: 5 nov. 2002
 * Time: 11:39:14
 * To change template for new class use
 * Code Style | Class Templates options (Tools | IDE Options).
 */
package fr.gouv.culture.sdx.search.lucene.analysis.filter;

import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

/**
 * Title: ChineseFilter
 * Description: Filter with a stop word table
 *              Rule: No digits are allowed.
 *                    An English word/token should be longer than 1 character.
 *                    One Chinese character as one Chinese word.
 * TO DO:
 *   1. Add Chinese stop words, such as \ue400
 *   2. Dictionary based Chinese word extraction
 *   3. Intelligent Chinese word extraction
 *
 * Copyright:    Copyright (c) 2001
 * Company:
 * @author Yiyi Sun
 * @version 1.0
 * @deprecated use {@link org.apache.lucene.analysis.cn.ChineseFilter} instead
 */

public final class ChineseFilter extends TokenFilter {

    /**
     * Default stop words, used when no custom stop-word collection is given.
     * Only English words for now; Chinese stop words are to be added later.
     */
    public static final String[] STOP_WORDS = {
        "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    /**
     * Terms dropped from the stream. Membership is tested against the exact
     * token text, so matching is case-sensitive.
     */
    private final Set stopTable;

    /**
     * Builds a filter that uses the default {@link #STOP_WORDS}.
     *
     * @param in the token stream to filter
     */
    public ChineseFilter(TokenStream in) {
        super(in);
        stopTable = new HashSet(STOP_WORDS.length);
        for (int i = 0; i < STOP_WORDS.length; i++) {
            stopTable.add(STOP_WORDS[i]);
        }
    }

    /**
     * Builds a filter that uses the given stop words.
     *
     * @param in        the token stream to filter
     * @param stopWords the terms to drop; the set is used as-is (not copied).
     *                  If <code>null</code>, no stop word is applied.
     */
    public ChineseFilter(TokenStream in, Set stopWords) {
        super(in);
        stopTable = (stopWords != null) ? stopWords : new HashSet();
    }

    /**
     * Builds a filter whose stop words are the keys of the given table.
     *
     * @param in        the token stream to filter
     * @param stopWords a table whose keys are the terms to drop; the key view
     *                  of the table is used as-is (not copied). If
     *                  <code>null</code>, no stop word is applied.
     */
    public ChineseFilter(TokenStream in, Hashtable stopWords) {
        super(in);
        // The null check must come before keySet(): keySet() itself never
        // returns null, but calling it on a null table would throw.
        stopTable = (stopWords != null) ? stopWords.keySet() : new HashSet();
    }

    /**
     * Returns the next token of the underlying stream that passes the filter.
     * <p>
     * A token is dropped when its text is null or empty, when it is a stop
     * word, or when its first character is neither a lowercase/uppercase
     * letter nor of the Unicode category "other letter". Tokens starting with
     * a lowercase or uppercase letter are kept only if they are longer than
     * one character; tokens starting with an "other letter" character (the
     * category that contains CJK ideographs) are always kept.
     *
     * @return the next accepted token, or <code>null</code> when the
     *         underlying stream is exhausted
     * @throws java.io.IOException if the underlying stream fails
     */
    public final Token next() throws java.io.IOException {

        for (Token token = input.next(); token != null; token = input.next()) {
            String text = token.termText();

            // A token without text has no first character to classify:
            // skip it instead of failing on charAt(0).
            if (text == null || text.length() == 0) {
                continue;
            }
            if (stopTable.contains(text)) {
                continue;
            }

            switch (Character.getType(text.charAt(0))) {

                case Character.LOWERCASE_LETTER:
                case Character.UPPERCASE_LETTER:
                    // An English word/token must be longer than 1 character.
                    if (text.length() > 1) {
                        return token;
                    }
                    break;

                case Character.OTHER_LETTER:
                    // One Chinese character as one Chinese word.
                    // Chinese word extraction to be added later here.
                    return token;

                default:
                    // Digits, punctuation and every other category are dropped.
                    break;
            }
        }
        return null;
    }

}