/* * Copyright 2005 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.wamblee.crawler.impl; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dom4j.Element; import org.dom4j.io.DOMReader; import org.w3c.dom.Document; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; import org.wamblee.crawler.Page; import org.wamblee.crawler.PageException; import org.wamblee.crawler.PageRequest; import org.wamblee.crawler.PageType; /** * Crawler implementation. */ public class CrawlerImpl implements Crawler { private static final Log LOG = LogFactory.getLog(CrawlerImpl.class); private static final int MAX_DELAY = 5000; private HttpClient _client; private Configuration _config; public CrawlerImpl(HttpClient aClient, Configuration aConfig) { _client = aClient; _config = aConfig; } /* * (non-Javadoc) * @see org.wamblee.crawler.Crawler#getPage(java.lang.String) */ public Page getPage(String aUrl) throws PageException { LOG.info("Getting page: url = '" + aUrl + "'"); PageRequest request = _config.getRequest(aUrl); Document content = request.execute(aUrl, _client); return transformToDom4jDoc(content); } /* (non-Javadoc) * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String) */ public Page getPage(String aUrl, PageType aType) throws PageException { LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'"); PageRequest request = _config.getRequest(aType); Document content = request.execute(aUrl, _client); return transformToDom4jDoc(content); } /** * @param aUrl * @param request * @return */ private Page transformToDom4jDoc(Document content) { DOMReader reader = new DOMReader(); org.dom4j.Document dom4jDoc = reader.read(content); Element root = dom4jDoc.getRootElement(); dom4jDoc.remove(root); return new PageImpl(this, replaceReferencesWithContent(root)); } /** * Perform crawling. Find references in the retrieved content and replace them * by the content they refer to by retrieving the appropriate pages as well. * @param content Content which must be made complete. * @return Fully processed content. */ private Element replaceReferencesWithContent(Element content) { return content; // TODO implement. } }