package org.wamblee.crawler.impl;
+import java.net.MalformedURLException;
+import java.net.URL;
import java.util.ArrayList;
import java.util.List;
+import org.apache.commons.httpclient.NameValuePair;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.XPath;
private static final String ATT_HREF = "reference";
private static final String ATT_TYPE = "type";
+
+ private static final String ELEM_PARAM = "param";
+
+ private static final String ATT_VALUE = "value";
+ private String _href;
+
private Crawler _crawler;
private Element _content;
*
* @param aContent
*/
- public PageImpl(Crawler aCrawler, Element aContent) {
+ public PageImpl(String aHref, Crawler aCrawler, Element aContent) {
+ _href = aHref;
_crawler = aCrawler;
_content = aContent;
_actions = computeActions();
String name = elem.attributeValue(ATT_NAME);
String href = elem.attributeValue(ATT_HREF);
String type = elem.attributeValue(ATT_TYPE);
+ NameValuePair[] params = getMandatoryParameters(elem);
+ href = absolutizeHref(_href, href);
if (type == null) {
- names.add(new ActionImpl(_crawler, elem, name, href));
+ names.add(new ActionImpl(_crawler, elem, name, href, params));
} else {
names.add(new ActionImpl(_crawler, elem, name, href,
- new PageType(type)));
+ new PageType(type), params));
}
}
return names.toArray(new Action[0]);
}
+
+ /**
+  * Resolves a link reference against the reference of the page it was found on.
+  * The page reference is parsed as a URL and used as the context when parsing
+  * the link reference; the resulting URL is returned in string form.
+  *
+  * @param aPageHref Absolute page reference.
+  * @param aLinkHref Possibly relative link reference.
+  * @return Absolute hyperlink.
+  * @throws RuntimeException if the page reference or the resolved link cannot be
+  *         parsed as a URL; the underlying MalformedURLException is the cause.
+  */
+ private String absolutizeHref(String aPageHref, String aLinkHref) {
+     try {
+         URL pageUrl = new URL(aPageHref);
+         // TODO need to use URL instead of String throughout the code.
+         return new URL(pageUrl, aLinkHref).toString();
+     } catch (MalformedURLException e) {
+         // Report both inputs: the cause alone does not always reveal which
+         // of the two references was rejected.
+         throw new RuntimeException("Malformed URL: cannot resolve link '" + aLinkHref +
+             "' against page '" + aPageHref + "'", e);
+     }
+ }
+
+ /**
+  * Builds the parameter list declared on an action element.
+  * Each child element named by ELEM_PARAM yields one pair made of its
+  * ATT_NAME and ATT_VALUE attribute values, in the order the children
+  * are returned by the element.
+  *
+  * @param aAction Action element whose parameter children are read.
+  * @return One name/value pair per parameter child; an empty array when
+  *         the element has no such children.
+  */
+ private NameValuePair[] getMandatoryParameters(Element aAction) {
+     List<Element> paramElements = (List<Element>) aAction.elements(ELEM_PARAM);
+     NameValuePair[] pairs = new NameValuePair[paramElements.size()];
+     int index = 0;
+     for (Element paramElement : paramElements) {
+         pairs[index++] = new NameValuePair(
+             paramElement.attributeValue(ATT_NAME),
+             paramElement.attributeValue(ATT_VALUE));
+     }
+     return pairs;
+ }
/*
* (non-Javadoc)
return null;
}
if (results.size() > 1) {
- throw new RuntimeException("Duplicate link '" + aName + "'");
+ throw new RuntimeException("Duplicate action '" + aName + "'");
}
return results.get(0);
}