/*
SDX: Documentary System in XML.
Copyright (C) 2000, 2001, 2002  Ministere de la culture et de la communication (France), AJLSM

Ministere de la culture et de la communication,
Mission de la recherche et de la technologie
3 rue de Valois, 75042 Paris Cedex 01 (France)
mrt@culture.fr, michel.bottin@culture.fr

AJLSM, 17, rue Vital Carles, 33000 Bordeaux (France)
sevigny@ajlsm.com

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the
Free Software Foundation, Inc.
59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
or connect to:
http://www.fsf.org/copyleft/gpl.html
*/
package fr.gouv.culture.sdx.search.lucene.analysis.filter;

import org.apache.avalon.framework.logger.LogEnabled;
import org.apache.avalon.framework.logger.Logger;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizerConstants;

/**
 * A token filter applying French-specific normalisation to the tokens
 * produced by the standard tokenizer.
 * <p>
 * Two token types are rewritten (see {@link #next()}): apostrophe tokens
 * lose a leading elided prefix, and acronym tokens lose their dots.
 * Every other token is passed through untouched.
 */
public class FrenchStandardFilter extends TokenFilter implements StandardTokenizerConstants, LogEnabled {

    /** Avalon logger handed over through {@link #enableLogging(Logger)}; it is only stored, never read, in this class. */
    private org.apache.avalon.framework.logger.Logger logger;

    /** Token type label of apostrophe tokens, taken from the tokenizer's image table. */
    private static final String APOSTROPHE_TYPE = tokenImage[APOSTROPHE];

    /** Token type label of acronym tokens, taken from the tokenizer's image table. */
    private static final String ACRONYM_TYPE = tokenImage[ACRONYM];

    /**
     * Builds a new filter without an input stream.
     * <p>
     * {@link #setUp(TokenStream)} must be called before {@link #next()},
     * since {@code next()} reads from the input stream unconditionally.
     */
    public FrenchStandardFilter() {
    }

    /**
     * Attaches this filter to a token stream.
     *
     * @param in The input token stream whose tokens will be filtered.
     */
    public void setUp(TokenStream in) {
        // Just keep track of the input stream.
        this.input = in;
    }

    /**
     * Sets the logger.
     *
     * @param logger The logger to use.
     */
    public void enableLogging(Logger logger) {
        this.logger = logger;
    }

    /**
     * Returns the next token in the stream, or null at EOS.
     * <p>
     * Operations performed:
     * <ul>
     * <li>Apostrophe tokens: when the apostrophe is the first or second
     *     character and at least three characters follow it, everything up to
     *     and including the apostrophe is removed (e.g. l'&eacute;cole becomes
     *     &eacute;cole). Other apostrophe tokens are returned unchanged.</li>
     * <li>Acronym tokens: all dots are removed.</li>
     * </ul>
     * Rewritten tokens keep the start offset, end offset and type of the
     * token they replace.
     *
     * @return the (possibly rewritten) next token, or null when the input is exhausted.
     * @throws java.io.IOException if the underlying stream fails.
     */
    public Token next() throws java.io.IOException {
        Token t = input.next();
        if (t == null) return null;

        String type = t.type();

        // Compare by value: reference equality (==) only works when the token's
        // type string is the very same instance as the constant.
        if (APOSTROPHE_TYPE.equals(type)) {
            return stripElision(t);
        }
        if (ACRONYM_TYPE.equals(type)) {
            return stripDots(t);
        }
        return t;
    }

    /**
     * Drops a leading elided prefix (everything up to and including the
     * apostrophe) from an apostrophe token, when the rule described in
     * {@link #next()} applies.
     */
    private static Token stripElision(Token t) {
        String text = t.termText();
        int idx = text.indexOf("'");

        // No apostrophe at all, or apostrophe too far from the beginning: keep as is.
        if (idx < 0 || idx >= 2) {
            return t;
        }

        // Only strip when at least three characters remain after the apostrophe.
        if (text.length() > idx + 3) {
            return new Token(text.substring(idx + 1), t.startOffset(), t.endOffset(), t.type());
        }

        // The original author noted that such a token should ideally be dropped,
        // but returning null is not an option: null signals end of stream, so
        // all following words would be lost. The token is passed through instead.
        return t;
    }

    /** Builds a copy of an acronym token with every dot removed from its text. */
    private static Token stripDots(Token t) {
        String text = t.termText();
        StringBuffer trimmed = new StringBuffer(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c != '.') trimmed.append(c);
        }
        return new Token(trimmed.toString(), t.startOffset(), t.endOffset(), t.type());
    }

}
