/* * Copyright 2005 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.wamblee.crawler.impl; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dom4j.Element; import org.dom4j.io.DOMReader; import org.w3c.dom.Document; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; import org.wamblee.crawler.Page; import org.wamblee.crawler.PageException; import org.wamblee.crawler.PageRequest; import org.wamblee.crawler.PageType; /** * Crawler implementation. */ public class CrawlerImpl implements Crawler { private static final Log LOG = LogFactory.getLog(CrawlerImpl.class); private HttpClient _client; private Configuration _config; /** * Constructs the crawler. * * @param aClient * Http client to use. * @param aConfig * Configuration. */ public CrawlerImpl(HttpClient aClient, Configuration aConfig) { _client = aClient; _config = aConfig; } /* * (non-Javadoc) * * @see org.wamblee.crawler.Crawler#getPage(java.lang.String) */ public Page getPage(String aUrl) throws PageException { LOG.debug("Getting page: url = '" + aUrl + "'"); PageRequest request = _config.getRequest(aUrl); Document content = request.execute(aUrl, _client); return transformToDom4jDoc(content); } /* * (non-Javadoc) * * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, * java.lang.String) */ public Page getPage(String aUrl, PageType aType) throws PageException { LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'"); PageRequest request = _config.getRequest(aType); Document content = request.execute(aUrl, _client); return transformToDom4jDoc(content); } /** * Converts a w3c DOM document to a page object. * @param content DOM document. * @return */ private Page transformToDom4jDoc(Document content) { DOMReader reader = new DOMReader(); org.dom4j.Document dom4jDoc = reader.read(content); Element root = dom4jDoc.getRootElement(); dom4jDoc.remove(root); return new PageImpl(this, replaceReferencesWithContent(root)); } /** * Perform crawling. Find references in the retrieved content and replace * them by the content they refer to by retrieving the appropriate pages as * well. * * @param content * Content which must be made complete. * @return Fully processed content. */ private Element replaceReferencesWithContent(Element content) { return content; // TODO implement. } }