wamblee.org Git - utils/blob - crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java

   1 /*
   2  * Copyright 2005 the original author or authors.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 package org.wamblee.crawler.impl;
  18
  19 import org.apache.commons.httpclient.HttpClient;
  20 import org.apache.commons.logging.Log;
  21 import org.apache.commons.logging.LogFactory;
  22 import org.dom4j.Element;
  23 import org.dom4j.io.DOMReader;
  24 import org.w3c.dom.Document;
  25 import org.wamblee.crawler.Configuration;
  26 import org.wamblee.crawler.Crawler;
  27 import org.wamblee.crawler.Page;
  28 import org.wamblee.crawler.PageException;
  29 import org.wamblee.crawler.PageRequest;
  30 import org.wamblee.crawler.PageType;
  31
  32 /**
  33  * Crawler implementation.
  34  */
  35 public class CrawlerImpl implements Crawler {
  36
  37     private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
  38
  39     private static final int MAX_DELAY = 5000;
  40
  41     private HttpClient _client;
  42
  43     private Configuration _config;
  44
  45     /**
  46      * Constructs the crawler.
  47      *
  48      * @param aClient
  49      *            Http client to use.
  50      * @param aConfig
  51      *            Configuration.
  52      */
  53     public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
  54         _client = aClient;
  55         _config = aConfig;
  56     }
  57
  58     /*
  59      * (non-Javadoc)
  60      *
  61      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
  62      */
  63     public Page getPage(String aUrl) throws PageException {
  64         LOG.info("Getting page: url = '" + aUrl + "'");
  65         PageRequest request = _config.getRequest(aUrl);
  66         Document content = request.execute(aUrl, _client);
  67         return transformToDom4jDoc(content);
  68     }
  69
  70     /*
  71      * (non-Javadoc)
  72      *
  73      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
  74      *      java.lang.String)
  75      */
  76     public Page getPage(String aUrl, PageType aType) throws PageException {
  77         LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
  78         PageRequest request = _config.getRequest(aType);
  79         Document content = request.execute(aUrl, _client);
  80         return transformToDom4jDoc(content);
  81     }
  82
  83     /**
  84      * Converts a w3c DOM document to a page object.
  85      * @param content DOM document.
  86      * @return
  87      */
  88     private Page transformToDom4jDoc(Document content) {
  89         DOMReader reader = new DOMReader();
  90         org.dom4j.Document dom4jDoc = reader.read(content);
  91         Element root = dom4jDoc.getRootElement();
  92         dom4jDoc.remove(root);
  93
  94         return new PageImpl(this, replaceReferencesWithContent(root));
  95     }
  96
  97     /**
  98      * Perform crawling. Find references in the retrieved content and replace
  99      * them by the content they refer to by retrieving the appropriate pages as
 100      * well.
 101      *
 102      * @param content
 103      *            Content which must be made complete.
 104      * @return Fully processed content.
 105      */
 106     private Element replaceReferencesWithContent(Element content) {
 107         return content; // TODO implement.
 108     }
 109 }