wamblee.org Git - utils/blob - trunk/crawler/basic/src/main/java/org/wamblee/crawler/impl/CrawlerImpl.java

   1 /*
   2  * Copyright 2005 the original author or authors.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 package org.wamblee.crawler.impl;
  18
  19 import org.apache.commons.httpclient.HttpClient;
  20 import org.apache.commons.httpclient.NameValuePair;
  21 import org.apache.commons.logging.Log;
  22 import org.apache.commons.logging.LogFactory;
  23 import org.dom4j.Element;
  24 import org.dom4j.io.DOMReader;
  25 import org.w3c.dom.Document;
  26 import org.wamblee.crawler.Configuration;
  27 import org.wamblee.crawler.Crawler;
  28 import org.wamblee.crawler.Page;
  29 import org.wamblee.crawler.PageException;
  30 import org.wamblee.crawler.PageRequest;
  31 import org.wamblee.crawler.PageType;
  32
  33 /**
  34  * Crawler implementation.
  35  *
  36  * @author Erik Brakkee
  37  */
  38 public class CrawlerImpl implements Crawler {
  39
  40     private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
  41
  42     private HttpClient _client;
  43
  44     private Configuration _config;
  45
  46     /**
  47      * Constructs the crawler.
  48      *
  49      * @param aClient
  50      *            Http client to use.
  51      * @param aConfig
  52      *            Configuration.
  53      */
  54     public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
  55         _client = aClient;
  56         _config = aConfig;
  57     }
  58
  59     /*
  60      * (non-Javadoc)
  61      *
  62      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
  63      */
  64     public Page getPage(String aUrl, NameValuePair[] aParams) throws PageException {
  65         LOG.debug("Getting page: url = '" + aUrl + "'");
  66         PageRequest request = _config.getRequest(aUrl);
  67         Document content = request.execute(aUrl, aParams, _client);
  68         return transformToDom4jDoc(aUrl, content);
  69     }
  70
  71     /*
  72      * (non-Javadoc)
  73      *
  74      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
  75      *      java.lang.String)
  76      */
  77     public Page getPage(String aUrl, NameValuePair[] aParams, PageType aType) throws PageException {
  78         LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
  79         PageRequest request = _config.getRequest(aType);
  80         Document content = request.execute(aUrl, aParams, _client);
  81         return transformToDom4jDoc(aUrl, content);
  82     }
  83
  84     /**
  85      * Converts a w3c DOM document to a page object.
  86      * @param content DOM document.
  87      * @return
  88      */
  89     private Page transformToDom4jDoc(String aUrl, Document content) {
  90         DOMReader reader = new DOMReader();
  91         org.dom4j.Document dom4jDoc = reader.read(content);
  92         Element root = dom4jDoc.getRootElement();
  93         dom4jDoc.remove(root);
  94
  95         return new PageImpl(aUrl, this, replaceReferencesWithContent(root));
  96     }
  97
  98     /**
  99      * Perform crawling. Find references in the retrieved content and replace
 100      * them by the content they refer to by retrieving the appropriate pages as
 101      * well.
 102      *
 103      * @param content
 104      *            Content which must be made complete.
 105      * @return Fully processed content.
 106      */
 107     private Element replaceReferencesWithContent(Element content) {
 108         return content; // TODO implement.
 109     }
 110 }