/*
Copyright (C) 2000-2010  Ministere de la culture et de la communication (France), AJLSM
See LICENCE file
*/
package fr.gouv.culture.sdx.search.lucene.analysis.filter;

import org.apache.avalon.framework.logger.LogEnabled;
import org.apache.avalon.framework.logger.Logger;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizerConstants;

/**
 * A token filter that normalizes two kinds of tokens produced by the
 * standard tokenizer for French text:
 * <ul>
 * <li>tokens typed as apostrophe words lose a short leading elision
 * (for instance <code>l'ecole</code> becomes <code>ecole</code>);</li>
 * <li>tokens typed as acronyms lose their dots.</li>
 * </ul>
 * All other tokens are passed through unchanged.
 */
public class FrenchStandardFilter extends TokenFilter implements StandardTokenizerConstants, LogEnabled {

    /**
     * Avalon logger handed over through {@link #enableLogging(Logger)}.
     * It is stored but not used by this class at the moment.
     */
    private org.apache.avalon.framework.logger.Logger logger;

    /** Token type assigned by the tokenizer to words containing an apostrophe. */
    private static final String APOSTROPHE_TYPE = tokenImage[APOSTROPHE];

    /** Token type assigned by the tokenizer to acronyms. */
    private static final String ACRONYM_TYPE = tokenImage[ACRONYM];

    /**
     * Builds a new filter on top of a token stream.
     * <p>
     * Since the upgrade to Lucene 2.1.0 the input stream is passed to the
     * constructor instead of using a no-argument constructor.
     *
     * @param in The input token stream.
     */
    public FrenchStandardFilter(TokenStream in) {
        super(in);
    }

    /**
     * Replaces the token stream this filter reads from.
     *
     * @param in The input token stream.
     */
    public void setUp(TokenStream in) {
        // Just keep track of the input stream.
        this.input = in;
    }

    /**
     * Sets the logger.
     *
     * @param logger The logger to use.
     */
    public void enableLogging(Logger logger) {
        this.logger = logger;
    }

    /**
     * Returns the next token in the stream, or null at EOS.
     * <p>
     * Operations performed:
     * <ul>
     * <li>For apostrophe tokens whose first apostrophe is the first or second
     * character, and which are followed by at least three characters,
     * everything up to and including the apostrophe is removed
     * (<code>l'ecole</code> becomes <code>ecole</code>).</li>
     * <li>For acronym tokens, every dot is removed.</li>
     * </ul>
     * Rewritten tokens keep the offsets, type and position increment of the
     * token they replace.
     *
     * @return The next (possibly rewritten) token, or null at end of stream.
     * @throws java.io.IOException If the underlying stream fails.
     */
    public Token next() throws java.io.IOException {
        Token t = input.next();
        if (t == null) {
            return null;
        }

        String text = t.termText();
        String type = t.type();

        // Compare by value, not identity: a token type is only guaranteed to be
        // an equal string, not the very same instance as the tokenImage entry.
        if (APOSTROPHE_TYPE.equals(type)) {
            int idx = text.indexOf("'");
            // Only an elision at the very beginning (apostrophe at index 0 or 1)
            // is stripped; anything else is left alone.
            if (idx < 0 || idx >= 2) {
                return t;
            }
            if (text.length() > idx + 3) {
                return rebuild(t, text.substring(idx + 1));
            }
            // The remainder is too short to stand alone. Ideally such a token
            // would be dropped, but returning null here would signal end of
            // stream and discard every following word, so it is kept as is.
            return t;
        }

        if (ACRONYM_TYPE.equals(type)) {
            // Remove dots
            StringBuffer trimmed = new StringBuffer(text.length());
            for (int i = 0; i < text.length(); i++) {
                char c = text.charAt(i);
                if (c != '.') {
                    trimmed.append(c);
                }
            }
            return rebuild(t, trimmed.toString());
        }

        return t;
    }

    /**
     * Creates a token carrying new text but the offsets, type and position
     * increment of an existing token.
     *
     * @param source  The token being replaced.
     * @param newText The text of the replacement token.
     * @return The replacement token.
     */
    private Token rebuild(Token source, String newText) {
        Token replacement = new Token(newText, source.startOffset(), source.endOffset(), source.type());
        // A freshly built token starts with the default increment; copy the
        // original one so token positions are not altered by this filter.
        replacement.setPositionIncrement(source.getPositionIncrement());
        return replacement;
    }

}
