/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.clustering.carrot2;
import java.util.*;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.nutch.clustering.HitsCluster;
import org.apache.nutch.clustering.OnlineClusterer;
import org.apache.nutch.searcher.HitDetails;
import org.carrot2.core.*;
import org.carrot2.core.attribute.AttributeNames;
import com.carrotsearch.lingo3g.Lingo3GClusteringAlgorithm;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
/**
 * An implementation of the {@link OnlineClusterer} extension that clusters
 * search hits using components of the Carrot2 project
 * (http://www.carrot2.org).
 *
 * <p>An instance must be configured with {@link #setConf(Configuration)}
 * before {@link #clusterHits(HitDetails[], String[])} is called; until then
 * no controller exists and clustering logs an error and returns an empty
 * array.</p>
 */
public class Clusterer implements OnlineClusterer, Configurable {
  /** Default language property name. */
  private final static String CONF_PROP_DEFAULT_LANGUAGE =
    "extension.clustering.carrot2.defaultLanguage";

  public static final Log logger = LogFactory.getLog(Clusterer.class);

  /** The Controller instance used for clustering. */
  private Controller controller;

  /** Nutch configuration. */
  private Configuration conf;

  /**
   * Default language for hits. English by default, but may be changed
   * via a property in Nutch configuration. Applied to every hit whose
   * own language is missing or not recognized.
   */
  private LanguageCode defaultLanguage = LanguageCode.ENGLISH;

  /**
   * An empty public constructor for making new instances
   * of the clusterer.
   */
  public Clusterer() {
    // Don't forget to call {@link #setConf(Configuration)}.
  }

  /**
   * Clusters the given hits. See {@link OnlineClusterer} for the contract.
   *
   * @param hitDetails details of the hits to cluster; must not be null
   * @param descriptions one description (snippet) per hit, in the same
   *        order as <code>hitDetails</code>; must not be null
   * @return the clusters produced for the hits, or an empty array when this
   *         instance has not been configured yet
   * @throws ProcessingException if either array is null or their lengths
   *         differ
   * @throws RuntimeException if the clustering itself fails; the original
   *         {@link ProcessingException} is preserved as the cause
   */
  public HitsCluster [] clusterHits(HitDetails [] hitDetails, String [] descriptions) {
    if (this.controller == null) {
      logger.error("initialize() not called.");
      return new HitsCluster[0];
    }

    if (hitDetails == null) {
      throw new ProcessingException("Hit details array must not be null.");
    }
    if (descriptions == null) {
      throw new ProcessingException("Descriptions array must not be null.");
    }
    if (hitDetails.length != descriptions.length) {
      throw new ProcessingException("Descriptions and hit details must be of the same length.");
    }

    // Prepare documents for Carrot2
    final List<Document> documents = Lists.newArrayListWithCapacity(hitDetails.length);
    for (int i = 0; i < hitDetails.length; i++) {
      documents.add(toDocument(hitDetails[i], descriptions[i]));
    }

    final Map<String, Object> attributes = Maps.newHashMap();
    attributes.put(AttributeNames.DOCUMENTS, documents);

    try {
      // Perform clustering
      final List<Cluster> carrotClusters = controller.process(attributes,
        Lingo3GClusteringAlgorithm.class).getClusters();
      return HitsClusterAdapter.adapt(carrotClusters, hitDetails);
    } catch (ProcessingException e) {
      throw new RuntimeException("Problems with the clustering.", e);
    }
  }

  /**
   * Converts a single hit into a Carrot2 document built from the hit's
   * "title" and "url" values and the supplied description, with the
   * document language always set.
   */
  private Document toDocument(HitDetails hit, String description) {
    final Document document = new Document(hit.getValue("title"),
      description, hit.getValue("url"));
    document.setLanguage(resolveLanguage(hit.getValue("lang")));
    return document;
  }

  /**
   * Maps the hit's "lang" value to a Carrot2 language, falling back to
   * {@link #defaultLanguage} when the value is blank or when
   * {@link LanguageCode#forISOCode(String)} yields no match (null).
   */
  private LanguageCode resolveLanguage(String isoCode) {
    if (StringUtils.isBlank(isoCode)) {
      // Hits without any language information get the configured default
      // instead of being left without a language.
      return defaultLanguage;
    }
    final LanguageCode recognized = LanguageCode.forISOCode(isoCode);
    return recognized != null ? recognized : defaultLanguage;
  }

  /**
   * Implementation of {@link Configurable}. Stores the configuration,
   * applies the optional default language property and (re)creates the
   * clustering controller.
   *
   * @param conf Nutch configuration to read settings from
   */
  public void setConf(Configuration conf) {
    this.conf = conf;

    // Configure default language and other component settings.
    final String configuredCode = conf.get(CONF_PROP_DEFAULT_LANGUAGE);
    if (configuredCode != null) {
      final LanguageCode languageCode = LanguageCode.forISOCode(configuredCode);
      if (languageCode != null) {
        // Change the default language.
        this.defaultLanguage = languageCode;
      } else if (logger.isWarnEnabled()) {
        // Make a misconfigured value visible; the previous default is kept.
        logger.warn("Unrecognized language code '" + configuredCode
          + "' in property " + CONF_PROP_DEFAULT_LANGUAGE
          + ", keeping: " + this.defaultLanguage);
      }
    }

    if (logger.isInfoEnabled()) {
      logger.info("Default language: " + this.defaultLanguage);
    }

    initialize();
  }

  /**
   * Implementation of {@link Configurable}.
   *
   * @return the configuration last passed to
   *         {@link #setConf(Configuration)}, or null if none was set
   */
  public Configuration getConf() {
    return conf;
  }

  /**
   * Initialize clustering processes and Carrot2 components by creating
   * the pooling controller used by
   * {@link #clusterHits(HitDetails[], String[])}.
   */
  private synchronized void initialize() {
    // Initialize the controller.
    controller = ControllerFactory.createPooling();
  }
}