From 5685a836b9208ff8babfe5ac5b30c5f86d27cf96 Mon Sep 17 00:00:00 2001 From: erik Date: Sat, 19 Aug 2006 23:52:53 +0000 Subject: [PATCH] support for parameters on actions. --- .../org/wamblee/crawler/AbstractPageRequest.java | 13 +++++++++++-- crawler/basic/src/org/wamblee/crawler/Action.java | 1 + crawler/basic/src/org/wamblee/crawler/Crawler.java | 10 +++++++--- .../src/org/wamblee/crawler/GetPageRequest.java | 7 ++++--- .../basic/src/org/wamblee/crawler/PageRequest.java | 4 +++- .../src/org/wamblee/crawler/PostPageRequest.java | 4 ++-- 6 files changed, 28 insertions(+), 11 deletions(-) diff --git a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index 28482d7f..2e598005 100644 --- a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -18,6 +18,9 @@ package org.wamblee.crawler; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; @@ -108,10 +111,15 @@ public abstract class AbstractPageRequest implements PageRequest { /** * Gets the parameters for the request. * + * @param aParams Additional parameters to use, obtained from another page, most likely as + * hidden form fields. * @return Request parameters. 
*/ - protected NameValuePair[] getParameters() { - return _params; + protected NameValuePair[] getParameters(NameValuePair[] aParams) { + List params = new ArrayList(); + params.addAll(Arrays.asList(_params)); + params.addAll(Arrays.asList(aParams)); + return params.toArray(new NameValuePair[0]); } /** @@ -176,6 +184,7 @@ public abstract class AbstractPageRequest implements PageRequest { try { aMethod = executeWithRedirects(aClient, aMethod); byte[] xhtmlData = getXhtml(aMethod); + Document transformed = _transformer.transform(xhtmlData, _transformer.resolve(_xslt)); diff --git a/crawler/basic/src/org/wamblee/crawler/Action.java b/crawler/basic/src/org/wamblee/crawler/Action.java index cd9b4e2a..f24cacd0 100644 --- a/crawler/basic/src/org/wamblee/crawler/Action.java +++ b/crawler/basic/src/org/wamblee/crawler/Action.java @@ -16,6 +16,7 @@ package org.wamblee.crawler; +import org.apache.commons.httpclient.NameValuePair; import org.dom4j.Element; /** diff --git a/crawler/basic/src/org/wamblee/crawler/Crawler.java b/crawler/basic/src/org/wamblee/crawler/Crawler.java index 00d1283a..3615d9bc 100644 --- a/crawler/basic/src/org/wamblee/crawler/Crawler.java +++ b/crawler/basic/src/org/wamblee/crawler/Crawler.java @@ -16,6 +16,8 @@ package org.wamblee.crawler; +import org.apache.commons.httpclient.NameValuePair; + /** * The object that actually obtains pages based on URL. @@ -25,17 +27,19 @@ public interface Crawler { /** * Gets the content for a specific page. * @param aUrl Url of page. + * @param aParameters Parameters to supply. * @return Page to retrieve. * @throws PageException In case of problems retrieving the page. */ - Page getPage(String aUrl) throws PageException; + Page getPage(String aUrl, NameValuePair[] aParameters) throws PageException; /** * Gets the content for a specific page. - * @param aUrl Url of page. + * @param aUrl Url of page. + * @param aParameters Parameters to supply. * @param aType Type of page. * @return Page. 
* @throws PageException In case of problems retrieving the page. */ - Page getPage(String aUrl, PageType aType) throws PageException; + Page getPage(String aUrl, NameValuePair[] aParameters, PageType aType) throws PageException; } diff --git a/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java b/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java index 40a34211..b737723d 100644 --- a/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java @@ -50,12 +50,13 @@ public class GetPageRequest extends AbstractPageRequest { * * @see org.wamblee.crawler.PageRequest#getPage(org.apache.commons.httpclient.HttpClient) */ - public Document execute(String aUrl, HttpClient aClient) + public Document execute(String aUrl, NameValuePair[] aParams, HttpClient aClient) throws PageException { HttpMethod method = new GetMethod(aUrl); - if (getParameters().length > 0) { + NameValuePair[] params = getParameters(aParams); + if (params.length > 0) { String oldQueryString = method.getQueryString(); - method.setQueryString(getParameters()); + method.setQueryString(params); String queryString = method.getQueryString(); if (oldQueryString.length() > 0) { queryString = queryString + '&' + oldQueryString; diff --git a/crawler/basic/src/org/wamblee/crawler/PageRequest.java b/crawler/basic/src/org/wamblee/crawler/PageRequest.java index 192f74e4..be729069 100644 --- a/crawler/basic/src/org/wamblee/crawler/PageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/PageRequest.java @@ -17,6 +17,7 @@ package org.wamblee.crawler; import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.NameValuePair; import org.w3c.dom.Document; /** @@ -27,11 +28,12 @@ public interface PageRequest { /** * Gets a page as an XML document. * @param aUrl Url of the page. + * @param aParams Additional parameters to supply. * @param aClient Http client to use. * @return Client. 
* @throws PageException In case of problems retrieving the page. */ - Document execute(String aUrl, HttpClient aClient) throws PageException; + Document execute(String aUrl, NameValuePair[] aParams, HttpClient aClient) throws PageException; /** * Overrides the Xslt to use. This is used when the transformed page specifies diff --git a/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java b/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java index af160f60..4a6a073c 100644 --- a/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java @@ -52,10 +52,10 @@ public class PostPageRequest extends AbstractPageRequest { * @see org.wamblee.crawler.PageRequest#execute(java.lang.String, * org.apache.commons.httpclient.HttpClient) */ - public Document execute(String aUrl, HttpClient aClient) + public Document execute(String aUrl, NameValuePair[] aParams, HttpClient aClient) throws PageException { PostMethod method = new PostMethod(aUrl); - method.addParameters(getParameters()); + method.addParameters(getParameters(aParams)); try { return executeMethod(aClient, method); } catch (TransformerException e) { -- 2.31.1