package org.wamblee.crawler.impl;
+import org.apache.commons.httpclient.NameValuePair;
import org.dom4j.Element;
import org.wamblee.crawler.Action;
import org.wamblee.crawler.Crawler;
private String _reference;
private PageType _type;
+
+ private NameValuePair[] _parameters;
/**
* Constructs the action.
* Name of the action.
* @param aReference
* URL of the reference.
+ * @param aParameters Parameters to use for the action.
*/
public ActionImpl(Crawler aCrawler, Element aContent, String aName,
- String aReference) {
+ String aReference, NameValuePair[] aParameters) {
_crawler = aCrawler;
_content = aContent;
_name = aName;
_reference = aReference;
_type = null;
+ _parameters = aParameters;
}
/**
* URL of the reference.
* @param aType
* Type of the referenced page.
+ * @param aParameters Parameters to use.
*/
public ActionImpl(Crawler aCrawler, Element aContent, String aName,
- String aReference, PageType aType) {
+ String aReference, PageType aType, NameValuePair[] aParameters) {
_crawler = aCrawler;
_content = aContent;
_name = aName;
_reference = aReference;
_type = aType;
+ _parameters = aParameters;
}
/*
*/
public Page execute() throws PageException {
if (_type == null) {
- return _crawler.getPage(_reference);
+ return _crawler.getPage(_reference, _parameters);
}
- return _crawler.getPage(_reference, _type);
+ return _crawler.getPage(_reference, _parameters, _type);
}
/*
import java.io.InputStream;
import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
import org.dom4j.Element;
import org.wamblee.crawler.Action;
import org.wamblee.crawler.Configuration;
Crawler crawler = new CrawlerImpl(client, config);
System.out.println("Retrieving: " + starturl);
- Page page = crawler.getPage(starturl);
+ Page page = crawler.getPage(starturl, new NameValuePair[0]);
showPage(page);
page = page.getAction("channels-favorites").execute();
recordInterestingShows(page);
package org.wamblee.crawler.impl;
import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Element;
*
- * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
+ * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, NameValuePair[])
*/
- public Page getPage(String aUrl) throws PageException {
+ public Page getPage(String aUrl, NameValuePair[] aParams) throws PageException {
LOG.debug("Getting page: url = '" + aUrl + "'");
PageRequest request = _config.getRequest(aUrl);
- Document content = request.execute(aUrl, _client);
- return transformToDom4jDoc(content);
+ Document content = request.execute(aUrl, aParams, _client);
+ return transformToDom4jDoc(aUrl, content);
}
/*
- * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
- * java.lang.String)
+ * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
+ * NameValuePair[], PageType)
*/
- public Page getPage(String aUrl, PageType aType) throws PageException {
+ public Page getPage(String aUrl, NameValuePair[] aParams, PageType aType) throws PageException {
LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
PageRequest request = _config.getRequest(aType);
- Document content = request.execute(aUrl, _client);
- return transformToDom4jDoc(content);
+ Document content = request.execute(aUrl, aParams, _client);
+ return transformToDom4jDoc(aUrl, content);
}
/**
* @param content DOM document.
* @return
*/
- private Page transformToDom4jDoc(Document content) {
+ private Page transformToDom4jDoc(String aUrl, Document content) {
DOMReader reader = new DOMReader();
org.dom4j.Document dom4jDoc = reader.read(content);
Element root = dom4jDoc.getRootElement();
dom4jDoc.remove(root);
- return new PageImpl(this, replaceReferencesWithContent(root));
+ return new PageImpl(aUrl, this, replaceReferencesWithContent(root));
}
/**
package org.wamblee.crawler.impl;
+import java.net.MalformedURLException;
+import java.net.URL;
import java.util.ArrayList;
import java.util.List;
+import org.apache.commons.httpclient.NameValuePair;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.XPath;
private static final String ATT_HREF = "reference";
private static final String ATT_TYPE = "type";
+
+ private static final String ELEM_PARAM = "param";
+
+ private static final String ATT_VALUE = "value";
+ private String _href;
+
private Crawler _crawler;
private Element _content;
*
* @param aContent
*/
- public PageImpl(Crawler aCrawler, Element aContent) {
+ public PageImpl(String aHref, Crawler aCrawler, Element aContent) {
+ _href = aHref;
_crawler = aCrawler;
_content = aContent;
_actions = computeActions();
String name = elem.attributeValue(ATT_NAME);
String href = elem.attributeValue(ATT_HREF);
String type = elem.attributeValue(ATT_TYPE);
+ NameValuePair[] params = getMandatoryParameters(elem);
+ href = absolutizeHref(_href, href);
if (type == null) {
- names.add(new ActionImpl(_crawler, elem, name, href));
+ names.add(new ActionImpl(_crawler, elem, name, href, params));
} else {
names.add(new ActionImpl(_crawler, elem, name, href,
- new PageType(type)));
+ new PageType(type), params));
}
}
return names.toArray(new Action[0]);
}
+
+    /**
+     * Absolutize the hyperlink by resolving it against the reference of the
+     * page it was found on, using {@link URL#URL(URL, String)}.
+     *
+     * @param aPageHref Absolute page reference.
+     * @param aLinkHref Possibly relative link reference.
+     * @return Absolute hyperlink.
+     * @throws RuntimeException if the page reference is not a valid URL or the
+     *         link cannot be resolved against it; the message names both
+     *         references and the original exception is kept as the cause.
+     */
+    private String absolutizeHref(String aPageHref, String aLinkHref) {
+        try {
+            URL pageUrl = new URL(aPageHref);
+            URL newUrl = new URL(pageUrl, aLinkHref);
+            return newUrl.toString(); // TODO need to use URL instead of String throughout the code.
+        } catch (MalformedURLException e) {
+            // Report both inputs: without them a broken link in a crawled page
+            // cannot be traced back to the page that contained it.
+            throw new RuntimeException("Malformed URL: page = '" + aPageHref
+                + "', link = '" + aLinkHref + "'", e);
+        }
+    }
+
+    /**
+     * Collects the parameters declared for an action.
+     * Every child element of the action named {@link #ELEM_PARAM} yields one
+     * pair, taken from its {@code ATT_NAME} and {@link #ATT_VALUE} attributes,
+     * in the order in which the child elements are returned.
+     *
+     * @param aAction Action element to read the parameters from.
+     * @return The parameters of the action; an empty array if it has none.
+     */
+    private NameValuePair[] getMandatoryParameters(Element aAction) {
+        List<Element> paramElements = (List<Element>) aAction.elements(ELEM_PARAM);
+        NameValuePair[] pairs = new NameValuePair[paramElements.size()];
+        for (int i = 0; i < pairs.length; i++) {
+            Element paramElement = paramElements.get(i);
+            String paramName = paramElement.attributeValue(ATT_NAME);
+            String paramValue = paramElement.attributeValue(ATT_VALUE);
+            pairs[i] = new NameValuePair(paramName, paramValue);
+        }
+        return pairs;
+    }
/*
* (non-Javadoc)