X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fbasic%2Fsrc%2Fmain%2Fjava%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FCrawlerImpl.java;fp=crawler%2Fbasic%2Fsrc%2Fmain%2Fjava%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FCrawlerImpl.java;h=098ed91f42ee30d072b2ce5d5339e37d3e8a2726;hb=62f165891f08ae532b5a794af11d7338a93f9a43;hp=0000000000000000000000000000000000000000;hpb=07cedd3f0730646ea35a7f668b3e1e872a4605d9;p=utils diff --git a/crawler/basic/src/main/java/org/wamblee/crawler/impl/CrawlerImpl.java b/crawler/basic/src/main/java/org/wamblee/crawler/impl/CrawlerImpl.java new file mode 100644 index 00000000..098ed91f --- /dev/null +++ b/crawler/basic/src/main/java/org/wamblee/crawler/impl/CrawlerImpl.java @@ -0,0 +1,108 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.impl; + +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.NameValuePair; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dom4j.Element; +import org.dom4j.io.DOMReader; +import org.w3c.dom.Document; +import org.wamblee.crawler.Configuration; +import org.wamblee.crawler.Crawler; +import org.wamblee.crawler.Page; +import org.wamblee.crawler.PageException; +import org.wamblee.crawler.PageRequest; +import org.wamblee.crawler.PageType; + +/** + * Crawler implementation. + */ +public class CrawlerImpl implements Crawler { + + private static final Log LOG = LogFactory.getLog(CrawlerImpl.class); + + private HttpClient _client; + + private Configuration _config; + + /** + * Constructs the crawler. + * + * @param aClient + * Http client to use. + * @param aConfig + * Configuration. + */ + public CrawlerImpl(HttpClient aClient, Configuration aConfig) { + _client = aClient; + _config = aConfig; + } + + /* + * (non-Javadoc) + * + * @see org.wamblee.crawler.Crawler#getPage(java.lang.String) + */ + public Page getPage(String aUrl, NameValuePair[] aParams) throws PageException { + LOG.debug("Getting page: url = '" + aUrl + "'"); + PageRequest request = _config.getRequest(aUrl); + Document content = request.execute(aUrl, aParams, _client); + return transformToDom4jDoc(aUrl, content); + } + + /* + * (non-Javadoc) + * + * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, + * java.lang.String) + */ + public Page getPage(String aUrl, NameValuePair[] aParams, PageType aType) throws PageException { + LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'"); + PageRequest request = _config.getRequest(aType); + Document content = request.execute(aUrl, aParams, _client); + return transformToDom4jDoc(aUrl, content); + } + + /** + * Converts a w3c DOM document to a page object. + * @param content DOM document. + * @return + */ + private Page transformToDom4jDoc(String aUrl, Document content) { + DOMReader reader = new DOMReader(); + org.dom4j.Document dom4jDoc = reader.read(content); + Element root = dom4jDoc.getRootElement(); + dom4jDoc.remove(root); + + return new PageImpl(aUrl, this, replaceReferencesWithContent(root)); + } + + /** + * Perform crawling. Find references in the retrieved content and replace + * them by the content they refer to by retrieving the appropriate pages as + * well. + * + * @param content + * Content which must be made complete. + * @return Fully processed content. + */ + private Element replaceReferencesWithContent(Element content) { + return content; // TODO implement. + } +}