/*
 * Copyright 2005 the original author or authors.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.wamblee.crawler.impl;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Element;
import org.dom4j.io.DOMReader;
import org.w3c.dom.Document;
import org.wamblee.crawler.Configuration;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
import org.wamblee.crawler.PageException;
import org.wamblee.crawler.PageRequest;
import org.wamblee.crawler.PageType;

/**
 * Crawler implementation. Looks up the page request to use in the
 * configuration, executes it with the HTTP client and wraps the resulting
 * document content in a {@link Page}.
 */
public class CrawlerImpl implements Crawler {

    private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);

    /**
     * Http client passed to every page request that is executed.
     */
    private final HttpClient _client;

    /**
     * Configuration from which the page request for a URL or page type is
     * obtained.
     */
    private final Configuration _config;

    /**
     * Constructs the crawler.
     * 
     * @param aClient
     *            Http client to use.
     * @param aConfig
     *            Configuration.
     */
    public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
        _client = aClient;
        _config = aConfig;
    }

    /*
     * (non-Javadoc)
     * 
     * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
     */
    public Page getPage(String aUrl) throws PageException {
        LOG.debug("Getting page: url = '" + aUrl + "'");
        // The request is selected by the configuration based on the URL.
        return retrievePage(_config.getRequest(aUrl), aUrl);
    }

    /*
     * (non-Javadoc)
     * 
     * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
     *      org.wamblee.crawler.PageType)
     */
    public Page getPage(String aUrl, PageType aType) throws PageException {
        LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
        // The request is selected by the configuration based on the type.
        return retrievePage(_config.getRequest(aType), aUrl);
    }

    /**
     * Executes a page request for a URL and converts the result to a page
     * object. Shared by both <code>getPage</code> variants, which only differ
     * in how the request is looked up.
     * 
     * @param aRequest
     *            Request to execute.
     * @param aUrl
     *            URL to retrieve.
     * @return Page wrapping the retrieved content.
     * @throws PageException
     *             In case executing the request fails.
     */
    private Page retrievePage(PageRequest aRequest, String aUrl)
            throws PageException {
        Document content = aRequest.execute(aUrl, _client);
        return transformToDom4jDoc(content);
    }

    /**
     * Converts a w3c DOM document to a page object.
     * 
     * @param aContent
     *            DOM document.
     * @return Page object created from the root element of the converted
     *         document.
     */
    private Page transformToDom4jDoc(Document aContent) {
        DOMReader reader = new DOMReader();
        org.dom4j.Document dom4jDoc = reader.read(aContent);
        Element root = dom4jDoc.getRootElement();
        // Remove the root element from the converted document before it is
        // handed to the page.
        dom4jDoc.remove(root);

        return new PageImpl(this, replaceReferencesWithContent(root));
    }

    /**
     * Perform crawling. Find references in the retrieved content and replace
     * them by the content they refer to by retrieving the appropriate pages as
     * well.
     * 
     * Not implemented yet: the content is currently returned unchanged.
     * 
     * @param aContent
     *            Content which must be made complete.
     * @return Fully processed content.
     */
    private Element replaceReferencesWithContent(Element aContent) {
        return aContent; // TODO implement.
    }
}