import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
/**
* Gets the parameters for the request.
*
+ * @param aParams Additional parameters to use, obtained from another page, most likely as
+ * hidden form fields.
* @return Request parameters.
*/
- protected NameValuePair[] getParameters() {
- return _params;
+ protected NameValuePair[] getParameters(NameValuePair[] aParams) {
+ // Build the combined list: the parameters held in _params come first,
+ // followed by the additional parameters supplied by the caller.
+ List<NameValuePair> params = new ArrayList<NameValuePair>();
+ params.addAll(Arrays.asList(_params));
+ // Treat a null array as "no additional parameters"; Arrays.asList would
+ // otherwise throw a NullPointerException here.
+ if (aParams != null) {
+ params.addAll(Arrays.asList(aParams));
+ }
+ return params.toArray(new NameValuePair[0]);
}
/**
try {
aMethod = executeWithRedirects(aClient, aMethod);
byte[] xhtmlData = getXhtml(aMethod);
+
Document transformed = _transformer.transform(xhtmlData,
_transformer.resolve(_xslt));
package org.wamblee.crawler;
+import org.apache.commons.httpclient.NameValuePair;
import org.dom4j.Element;
/**
package org.wamblee.crawler;
+import org.apache.commons.httpclient.NameValuePair;
+
/**
* The object that actually obtains pages based on URL.
/**
* Gets the content for a specific page.
* @param aUrl Url of page.
+ * @param aParameters Parameters to supply.
* @return Page to retrieve.
* @throws PageException In case of problems retrieving the page.
*/
- Page getPage(String aUrl) throws PageException;
+ Page getPage(String aUrl, NameValuePair[] aParameters) throws PageException;
/**
* Gets the content for a specific page.
- * @param aUrl Url of page.
+ * @param aUrl Url of page.
+ * @param aParameters Parameters to supply.
* @param aType Type of page.
* @return Page.
* @throws PageException In case of problems retrieving the page.
*/
- Page getPage(String aUrl, PageType aType) throws PageException;
+ Page getPage(String aUrl, NameValuePair[] aParameters, PageType aType) throws PageException;
}
*
* @see org.wamblee.crawler.PageRequest#execute(java.lang.String,
* org.apache.commons.httpclient.NameValuePair[], org.apache.commons.httpclient.HttpClient)
*/
- public Document execute(String aUrl, HttpClient aClient)
+ public Document execute(String aUrl, NameValuePair[] aParams, HttpClient aClient)
throws PageException {
HttpMethod method = new GetMethod(aUrl);
- if (getParameters().length > 0) {
+ NameValuePair[] params = getParameters(aParams);
+ if (params.length > 0) {
String oldQueryString = method.getQueryString();
- method.setQueryString(getParameters());
+ method.setQueryString(params);
String queryString = method.getQueryString();
if (oldQueryString.length() > 0) {
queryString = queryString + '&' + oldQueryString;
package org.wamblee.crawler;
import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
import org.w3c.dom.Document;
/**
/**
* Gets a page as an XML document.
* @param aUrl Url of the page.
+ * @param aParams Additional parameters to supply.
* @param aClient Http client to use.
* @return The page as an XML document.
* @throws PageException In case of problems retrieving the page.
*/
- Document execute(String aUrl, HttpClient aClient) throws PageException;
+ Document execute(String aUrl, NameValuePair[] aParams, HttpClient aClient) throws PageException;
/**
* Overrides the Xslt to use. This is used when the transformed page specifies
* @see org.wamblee.crawler.PageRequest#execute(java.lang.String,
* org.apache.commons.httpclient.NameValuePair[], org.apache.commons.httpclient.HttpClient)
*/
- public Document execute(String aUrl, HttpClient aClient)
+ public Document execute(String aUrl, NameValuePair[] aParams, HttpClient aClient)
throws PageException {
PostMethod method = new PostMethod(aUrl);
- method.addParameters(getParameters());
+ method.addParameters(getParameters(aParams));
try {
return executeMethod(aClient, method);
} catch (TransformerException e) {