X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FCrawlerImpl.java;h=0188ad31c94d2a8eb8a2d2e040fb7af3fd90f122;hb=8bdf7301b21a7824933fac2b75caf410b7dd5923;hp=8db31606fc476e419cde8a4055a5286ad8f2c324;hpb=30671b398473b876e5c42d063f0c8e169ad3163c;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java index 8db31606..0188ad31 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java @@ -12,7 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.crawler.impl; @@ -25,67 +25,83 @@ import org.w3c.dom.Document; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; import org.wamblee.crawler.Page; +import org.wamblee.crawler.PageException; import org.wamblee.crawler.PageRequest; import org.wamblee.crawler.PageType; /** - * Crawler implementation. + * Crawler implementation. */ public class CrawlerImpl implements Crawler { - + private static final Log LOG = LogFactory.getLog(CrawlerImpl.class); - - private HttpClient _client; - private Configuration _config; - + + private HttpClient _client; + + private Configuration _config; + + /** + * Constructs the crawler. + * + * @param aClient + * Http client to use. + * @param aConfig + * Configuration. + */ public CrawlerImpl(HttpClient aClient, Configuration aConfig) { - _client = aClient; - _config = aConfig; + _client = aClient; + _config = aConfig; } /* - * (non-Javadoc) + * (non-Javadoc) + * * @see org.wamblee.crawler.Crawler#getPage(java.lang.String) */ - public Page getPage(String aUrl) { - LOG.info("Getting page: url = '" + aUrl + "'"); + public Page getPage(String aUrl) throws PageException { + LOG.debug("Getting page: url = '" + aUrl + "'"); PageRequest request = _config.getRequest(aUrl); Document content = request.execute(aUrl, _client); - return transformToDom4jDoc(content); + return transformToDom4jDoc(content); } - - /* (non-Javadoc) - * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String) + + /* + * (non-Javadoc) + * + * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, + * java.lang.String) */ - public Page getPage(String aUrl, PageType aType) { - LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'"); + public Page getPage(String aUrl, PageType aType) throws PageException { + LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'"); PageRequest request = _config.getRequest(aType); Document content = request.execute(aUrl, _client); - return transformToDom4jDoc(content); + return transformToDom4jDoc(content); } /** - * @param aUrl - * @param request + * Converts a w3c DOM document to a page object. + * @param content DOM document. * @return */ private Page transformToDom4jDoc(Document content) { - DOMReader reader = new DOMReader(); org.dom4j.Document dom4jDoc = reader.read(content); Element root = dom4jDoc.getRootElement(); dom4jDoc.remove(root); - + return new PageImpl(this, replaceReferencesWithContent(root)); } - + /** - * Perform crawling. Find references in the retrieved content and replace them - * by the content they refer to by retrieving the appropriate pages as well. - * @param content Content which must be made complete. - * @return Fully processed content. + * Perform crawling. Find references in the retrieved content and replace + * them by the content they refer to by retrieving the appropriate pages as + * well. + * + * @param content + * Content which must be made complete. + * @return Fully processed content. */ - private Element replaceReferencesWithContent(Element content) { - return content; // TODO implement. + private Element replaceReferencesWithContent(Element content) { + return content; // TODO implement. } }