wamblee.org Git - utils/blob - trunk/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java

   1 /*
   2  * Copyright 2005 the original author or authors.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 package org.wamblee.crawler.impl;
  18
  19 import org.apache.commons.httpclient.HttpClient;
  20 import org.apache.commons.logging.Log;
  21 import org.apache.commons.logging.LogFactory;
  22 import org.dom4j.Element;
  23 import org.dom4j.io.DOMReader;
  24 import org.w3c.dom.Document;
  25 import org.wamblee.crawler.Configuration;
  26 import org.wamblee.crawler.Crawler;
  27 import org.wamblee.crawler.Page;
  28 import org.wamblee.crawler.PageRequest;
  29 import org.wamblee.crawler.PageType;
  30
  31 /**
  32  * Crawler implementation.
  33  */
  34 public class CrawlerImpl implements Crawler {
  35
  36     private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
  37
  38     private HttpClient _client;
  39     private Configuration _config;
  40
  41     public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
  42         _client = aClient;
  43         _config = aConfig;
  44     }
  45
  46     /*
  47      *  (non-Javadoc)
  48      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
  49      */
  50     public Page getPage(String aUrl) {
  51         LOG.info("Getting page: url = '" + aUrl + "'");
  52         PageRequest request = _config.getRequest(aUrl);
  53         Document content = request.execute(aUrl, _client);
  54         return transformToDom4jDoc(content);
  55     }
  56
  57     /* (non-Javadoc)
  58      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String)
  59      */
  60     public Page getPage(String aUrl, PageType aType) {
  61         LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
  62         PageRequest request = _config.getRequest(aType);
  63         Document content = request.execute(aUrl, _client);
  64         return transformToDom4jDoc(content);
  65     }
  66
  67     /**
  68      * @param aUrl
  69      * @param request
  70      * @return
  71      */
  72     private Page transformToDom4jDoc(Document content) {
  73
  74         DOMReader reader = new DOMReader();
  75         org.dom4j.Document dom4jDoc = reader.read(content);
  76         Element root = dom4jDoc.getRootElement();
  77         dom4jDoc.remove(root);
  78
  79         return new PageImpl(this, replaceReferencesWithContent(root));
  80     }
  81
  82     /**
  83      * Perform crawling. Find references in the retrieved content and replace them
  84      * by the content they refer to by retrieving the appropriate pages as well.
  85      * @param content Content which must be made complete.
  86      * @return Fully processed content.
  87      */
  88     private Element replaceReferencesWithContent(Element content) {
  89         return content; // TODO implement.
  90     }
  91 }