/*
SDX: Documentary System in XML.
Copyright (C) 2000, 2001, 2003  Ministere de la culture et de la communication (France), AJLSM

Ministere de la culture et de la communication,
Mission de la recherche et de la technologie
3 rue de Valois, 75042 Paris Cedex 01 (France)
mrt@culture.fr, michel.bottin@culture.fr

AJLSM, 17, rue Vital Carles, 33000 Bordeaux (France)
sevigny@ajlsm.com

Pierrick Brihaye, 2003
pierrick.brihaye@wanadoo.fr

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the
Free Software Foundation, Inc.
59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
or connect to:
http://www.fsf.org/copyleft/gpl.html
 */

package fr.gouv.culture.sdx.search.lucene.analysis;

import gpl.pierrick.brihaye.aramorph.lucene.ArabicGlosser;
import gpl.pierrick.brihaye.aramorph.lucene.ArabicGrammaticalFilter;
import gpl.pierrick.brihaye.aramorph.lucene.ArabicTokenizer;
import gpl.pierrick.brihaye.aramorph.lucene.WhitespaceFilter;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.Logger;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;

import java.io.Reader;

/**
 * English glosser for Arabic text.
 *
 * <p>The analysis relies on Tim Buckwalter's algorithm (available from the
 * <a href="http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002L49">LDC
 * Catalog</a>) to determine the morphological category of each Arabic token and
 * then emit the corresponding English glosses. The set of morphological
 * categories regarded as meaningful is not final yet, but the current one
 * gives good results.</p>
 *
 * @author Pierrick Brihaye, 2003
 */
public final class Glosser_ar_en extends AbstractAnalyzer {

    /**
     * Common English words that usually carry no value for searching; they are
     * handed to the {@link StopFilter} that ends the analysis chain.
     */
    public static final String[] STOP_WORDS = {
        "a", "and", "are", "as", "at",
        "be", "but", "by",
        "for",
        "if", "in", "into", "is", "it",
        "no", "not",
        "of", "on", "or",
        "s", "such",
        "t", "that", "the", "their", "then", "there", "these", "they", "this", "to",
        "was", "will", "with"
    };

    /**
     * Builds the token stream producing English glosses for the Arabic words
     * whose morphological categories are considered semantically meaningful.
     *
     * <p>The chain is assembled one stage at a time. If any stage fails, the
     * error is logged and whatever has been assembled up to that point is
     * returned; this is {@code null} when even the tokenizer could not be
     * created.</p>
     *
     * @param fieldName The name of the field being analysed (not used here)
     * @param reader The reader supplying the text to analyse
     * @return The token stream, possibly incomplete or {@code null} after an error
     */
    public TokenStream tokenStream(final String fieldName, final Reader reader) {
        TokenStream chain = null;
        try {
            // Arabic side: tokenize, gloss, then apply the grammatical filter.
            chain = new ArabicTokenizer(reader);
            chain = new ArabicGlosser(chain);
            chain = new ArabicGrammaticalFilter(chain);
            // Gloss side: whitespace filter, then the usual Lucene treatment.
            chain = new WhitespaceFilter(chain);
            chain = new StandardFilter(chain);
            chain = new LowerCaseFilter(chain);
            chain = new StopFilter(chain, StopFilter.makeStopTable(STOP_WORDS));
        } catch (Exception e) {
            // Best effort: report the problem and fall through with the partial chain.
            this.logger.error("Arabic glosser error", e);
        }
        return chain;
    }

    /**
     * Configures the glosser; the work is entirely delegated to the superclass.
     *
     * @param configuration The configuration object
     * @throws ConfigurationException If a problem occurs during configuration
     */
    public void configure(final Configuration configuration) throws ConfigurationException {
        super.configure(configuration);
    }

    /**
     * Hands a logger over to this glosser; the work is entirely delegated to
     * the superclass.
     *
     * @param logger The logger
     */
    public void enableLogging(final Logger logger) {
        super.enableLogging(logger);
    }

}



