From b4555b9b3548c23244f85aaad3b92a471af63b46 Mon Sep 17 00:00:00 2001 From: Erik Brakkee Date: Sat, 19 Aug 2006 23:53:06 +0000 Subject: [PATCH] support for parameters on actions. --- .../org/wamblee/crawler/impl/ActionImpl.java | 15 +++++-- .../src/org/wamblee/crawler/impl/App.java | 3 +- .../org/wamblee/crawler/impl/CrawlerImpl.java | 17 +++---- .../org/wamblee/crawler/impl/PageImpl.java | 45 +++++++++++++++++-- 4 files changed, 64 insertions(+), 16 deletions(-) diff --git a/crawler/basic/src/org/wamblee/crawler/impl/ActionImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/ActionImpl.java index 0ff4252d..ab7068cb 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/ActionImpl.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/ActionImpl.java @@ -16,6 +16,7 @@ package org.wamblee.crawler.impl; +import org.apache.commons.httpclient.NameValuePair; import org.dom4j.Element; import org.wamblee.crawler.Action; import org.wamblee.crawler.Crawler; @@ -37,6 +38,8 @@ public class ActionImpl implements Action { private String _reference; private PageType _type; + + private NameValuePair[] _parameters; /** * Constructs the action. @@ -50,14 +53,16 @@ public class ActionImpl implements Action { * Name of the action. * @param aReference * URL of the reference. + * @param aParameters Parameters to use for the action. */ public ActionImpl(Crawler aCrawler, Element aContent, String aName, - String aReference) { + String aReference, NameValuePair[] aParameters) { _crawler = aCrawler; _content = aContent; _name = aName; _reference = aReference; _type = null; + _parameters = aParameters; } /** @@ -74,14 +79,16 @@ public class ActionImpl implements Action { * URL of the reference. * @param aType * Type of the referenced page. + * @param aParameters Parameters to use. */ public ActionImpl(Crawler aCrawler, Element aContent, String aName, - String aReference, PageType aType) { + String aReference, PageType aType, NameValuePair[] aParameters) { _crawler = aCrawler; _content = aContent; _name = aName; _reference = aReference; _type = aType; + _parameters = aParameters; } /* @@ -100,9 +107,9 @@ public class ActionImpl implements Action { */ public Page execute() throws PageException { if (_type == null) { - return _crawler.getPage(_reference); + return _crawler.getPage(_reference, _parameters); } - return _crawler.getPage(_reference, _type); + return _crawler.getPage(_reference, _parameters, _type); } /* diff --git a/crawler/basic/src/org/wamblee/crawler/impl/App.java b/crawler/basic/src/org/wamblee/crawler/impl/App.java index f9b9bd45..b0339f4b 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/App.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/App.java @@ -5,6 +5,7 @@ import java.io.FileInputStream; import java.io.InputStream; import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.NameValuePair; import org.dom4j.Element; import org.wamblee.crawler.Action; import org.wamblee.crawler.Configuration; @@ -65,7 +66,7 @@ public final class App { Crawler crawler = new CrawlerImpl(client, config); System.out.println("Retrieving: " + starturl); - Page page = crawler.getPage(starturl); + Page page = crawler.getPage(starturl, new NameValuePair[0]); showPage(page); page = page.getAction("channels-favorites").execute(); recordInterestingShows(page); diff --git a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java index 0188ad31..098ed91f 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java @@ -17,6 +17,7 @@ package org.wamblee.crawler.impl; import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.NameValuePair; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dom4j.Element; @@ -58,11 +59,11 @@ public class CrawlerImpl implements Crawler { * * @see org.wamblee.crawler.Crawler#getPage(java.lang.String) */ - public Page getPage(String aUrl) throws PageException { + public Page getPage(String aUrl, NameValuePair[] aParams) throws PageException { LOG.debug("Getting page: url = '" + aUrl + "'"); PageRequest request = _config.getRequest(aUrl); - Document content = request.execute(aUrl, _client); - return transformToDom4jDoc(content); + Document content = request.execute(aUrl, aParams, _client); + return transformToDom4jDoc(aUrl, content); } /* @@ -71,11 +72,11 @@ public class CrawlerImpl implements Crawler { * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, * java.lang.String) */ - public Page getPage(String aUrl, PageType aType) throws PageException { + public Page getPage(String aUrl, NameValuePair[] aParams, PageType aType) throws PageException { LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'"); PageRequest request = _config.getRequest(aType); - Document content = request.execute(aUrl, _client); - return transformToDom4jDoc(content); + Document content = request.execute(aUrl, aParams, _client); + return transformToDom4jDoc(aUrl, content); } /** @@ -83,13 +84,13 @@ public class CrawlerImpl implements Crawler { * @param content DOM document. * @return */ - private Page transformToDom4jDoc(Document content) { + private Page transformToDom4jDoc(String aUrl, Document content) { DOMReader reader = new DOMReader(); org.dom4j.Document dom4jDoc = reader.read(content); Element root = dom4jDoc.getRootElement(); dom4jDoc.remove(root); - return new PageImpl(this, replaceReferencesWithContent(root)); + return new PageImpl(aUrl, this, replaceReferencesWithContent(root)); } /** diff --git a/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java index d478ed4b..1265bc29 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java @@ -16,9 +16,12 @@ package org.wamblee.crawler.impl; +import java.net.MalformedURLException; +import java.net.URL; import java.util.ArrayList; import java.util.List; +import org.apache.commons.httpclient.NameValuePair; import org.dom4j.DocumentHelper; import org.dom4j.Element; import org.dom4j.XPath; @@ -39,7 +42,13 @@ public class PageImpl implements Page { private static final String ATT_HREF = "reference"; private static final String ATT_TYPE = "type"; + + private static final String ELEM_PARAM = "param"; + + private static final String ATT_VALUE = "value"; + private String _href; + private Crawler _crawler; private Element _content; @@ -51,7 +60,8 @@ public class PageImpl implements Page { * * @param aContent */ - public PageImpl(Crawler aCrawler, Element aContent) { + public PageImpl(String aHref, Crawler aCrawler, Element aContent) { + _href = aHref; _crawler = aCrawler; _content = aContent; _actions = computeActions(); @@ -70,15 +80,44 @@ public class PageImpl implements Page { String name = elem.attributeValue(ATT_NAME); String href = elem.attributeValue(ATT_HREF); String type = elem.attributeValue(ATT_TYPE); + NameValuePair[] params = getMandatoryParameters(elem); + href = absolutizeHref(_href, href); if (type == null) { - names.add(new ActionImpl(_crawler, elem, name, href)); + names.add(new ActionImpl(_crawler, elem, name, href, params)); } else { names.add(new ActionImpl(_crawler, elem, name, href, - new PageType(type))); + new PageType(type), params)); } } return names.toArray(new Action[0]); } + + /** + * Absolutize the hyperlink + * @param aPageHref Absolute page reference. + * @param aLinkHref Possibly relative link reference. + * @return Absolute hyperlink. + */ + private String absolutizeHref(String aPageHref, String aLinkHref) { + + try { + URL pageUrl = new URL(aPageHref); + URL newUrl = new URL(pageUrl, aLinkHref); + return newUrl.toString(); // TODO need to use URL instead of String throughout the code. + } catch (MalformedURLException e) { + throw new RuntimeException("Malformed URL", e); + } + } + + private NameValuePair[] getMandatoryParameters(Element aAction) { + List result = new ArrayList(); + for (Element param: (List)aAction.elements(ELEM_PARAM)) { + String name = param.attributeValue(ATT_NAME); + String value = param.attributeValue(ATT_VALUE); + result.add(new NameValuePair(name, value)); + } + return result.toArray(new NameValuePair[0]); + } /* * (non-Javadoc) -- 2.31.1