package org.wamblee.crawler.impl;
+import java.net.MalformedURLException;
+import java.net.URL;
import java.util.ArrayList;
import java.util.List;
+import org.apache.commons.httpclient.NameValuePair;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.XPath;
private static final String ATT_HREF = "reference"; // attribute read as an action's link target
private static final String ATT_TYPE = "type"; // optional attribute; when present it is wrapped in a PageType
+
+ private static final String ELEM_PARAM = "param"; // name of the child elements read as action parameters
+
+ private static final String ATT_VALUE = "value"; // attribute supplying a parameter's value
+ private String _href; // reference of this page; base for resolving action links
+
private Crawler _crawler; // passed on to every ActionImpl created by this page
private Element _content; // content element supplied at construction
-
- private Action[] _actions;
+
+ private Action[] _actions;
/**
* Constructs a page and computes its actions once, up front.
*
* @param aHref
*            Reference of this page; stored and used as the base against
*            which action links are resolved.
* @param aCrawler
*            Crawler handed to the actions created for this page.
* @param aContent
*            Content element of the page.
*/
- public PageImpl(Crawler aCrawler, Element aContent) {
+ public PageImpl(String aHref, Crawler aCrawler, Element aContent) {
+ _href = aHref;
_crawler = aCrawler;
_content = aContent;
_actions = computeActions(); // NOTE(review): presumably reads the fields assigned above, so keep it last - confirm
}
-
+
/*
* (non-Javadoc)
*
String name = elem.attributeValue(ATT_NAME);
String href = elem.attributeValue(ATT_HREF);
String type = elem.attributeValue(ATT_TYPE);
- if (type == null ) {
- names.add(new ActionImpl(_crawler, elem, name, href));
- }
- else {
- names.add(new ActionImpl(_crawler, elem, name, href, new PageType(type)));
+ NameValuePair[] params = getMandatoryParameters(elem);
+ href = absolutizeHref(_href, href);
+ if (type == null) {
+ names.add(new ActionImpl(_crawler, elem, name, href, params));
+ } else {
+ names.add(new ActionImpl(_crawler, elem, name, href,
+ new PageType(type), params));
}
}
return names.toArray(new Action[0]);
}
+
+ /**
+ * Absolutizes a hyperlink by resolving it against the reference of the
+ * page it belongs to. Resolution is delegated to
+ * {@link URL#URL(URL, String)}.
+ *
+ * @param aPageHref Absolute page reference.
+ * @param aLinkHref Possibly relative link reference.
+ * @return Absolute hyperlink, as the string form of the resolved URL.
+ * @throws RuntimeException If a reference is rejected as malformed; the
+ *             MalformedURLException is kept as the cause.
+ */
+ private String absolutizeHref(String aPageHref, String aLinkHref) {
+
+ try {
+ URL pageUrl = new URL(aPageHref); // base; has to parse on its own
+ URL newUrl = new URL(pageUrl, aLinkHref); // link interpreted in the context of the base
+ return newUrl.toString(); // TODO need to use URL instead of String throughout the code.
+ } catch (MalformedURLException e) {
+ throw new RuntimeException("Malformed URL", e); // NOTE(review): message omits the offending references
+ }
+ }
+ /**
+ * Collects the parameters declared on an action element. Every child
+ * element named ELEM_PARAM contributes one name/value pair, read from
+ * its ATT_NAME and ATT_VALUE attributes, in iteration order.
+ *
+ * NOTE(review): a param element lacking one of the attributes presumably
+ * yields a pair with a null name or value - verify against dom4j.
+ *
+ * @param aAction Action element whose param children are read.
+ * @return One pair per param child; an empty array when the loop finds
+ *         none.
+ */
+ private NameValuePair[] getMandatoryParameters(Element aAction) {
+ List<NameValuePair> result = new ArrayList<NameValuePair>();
+ for (Element param: (List<Element>)aAction.elements(ELEM_PARAM)) { // cast suggests elements() returns an untyped List - confirm
+ String name = param.attributeValue(ATT_NAME);
+ String value = param.attributeValue(ATT_VALUE);
+ result.add(new NameValuePair(name, value));
+ }
+ return result.toArray(new NameValuePair[0]);
+ }
/*
* (non-Javadoc)
return _content;
}
- /* (non-Javadoc)
+ /*
+ * (non-Javadoc)
+ *
* @see org.wamblee.crawler.Page#getActions()
*/
public Action[] getActions() {
// Hands back the array assigned in the constructor; no copy is made.
// NOTE(review): a caller that modifies the returned array modifies this page's actions.
return _actions;
}
-
+
/*
- * (non-Javadoc)
+ * (non-Javadoc)
+ *
* @see org.wamblee.crawler.Page#getAction(java.lang.String)
*/
public Action getAction(String aName) {
List<Action> results = new ArrayList<Action>();
- for (Action action: _actions) {
- if ( action.getName().equals(aName)) {
+ for (Action action : _actions) {
+ if (action.getName().equals(aName)) {
results.add(action);
}
}
return null;
}
if (results.size() > 1) {
- throw new RuntimeException("Duplicate link '" + aName + "'");
+ throw new RuntimeException("Duplicate action '" + aName + "'");
}
return results.get(0);
}